diff options
author | Yaco <franco@reevo.org> | 2020-06-04 11:01:00 -0300 |
---|---|---|
committer | Yaco <franco@reevo.org> | 2020-06-04 11:01:00 -0300 |
commit | fc7369835258467bf97eb64f184b93691f9a9fd5 (patch) | |
tree | daabd60089d2dd76d9f5fb416b005fbe159c799d /www/wiki/includes/parser |
first commit
Diffstat (limited to 'www/wiki/includes/parser')
18 files changed, 19933 insertions, 0 deletions
diff --git a/www/wiki/includes/parser/BlockLevelPass.php b/www/wiki/includes/parser/BlockLevelPass.php new file mode 100644 index 00000000..1173dd20 --- /dev/null +++ b/www/wiki/includes/parser/BlockLevelPass.php @@ -0,0 +1,572 @@ +<?php + +/** + * This is the part of the wikitext parser which handles automatic paragraphs + * and conversion of start-of-line prefixes to HTML lists. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ +class BlockLevelPass { + private $DTopen = false; + private $inPre = false; + private $lastSection = ''; + private $lineStart; + private $text; + + # State constants for the definition list colon extraction + const COLON_STATE_TEXT = 0; + const COLON_STATE_TAG = 1; + const COLON_STATE_TAGSTART = 2; + const COLON_STATE_CLOSETAG = 3; + const COLON_STATE_TAGSLASH = 4; + const COLON_STATE_COMMENT = 5; + const COLON_STATE_COMMENTDASH = 6; + const COLON_STATE_COMMENTDASHDASH = 7; + const COLON_STATE_LC = 8; + + /** + * Make lists from lines starting with ':', '*', '#', etc. + * + * @param string $text + * @param bool $lineStart Whether or not this is at the start of a line. 
+ * @return string The lists rendered as HTML + */ + public static function doBlockLevels( $text, $lineStart ) { + $pass = new self( $text, $lineStart ); + return $pass->execute(); + } + + /** + * Private constructor + */ + private function __construct( $text, $lineStart ) { + $this->text = $text; + $this->lineStart = $lineStart; + } + + /** + * If a pre or p is open, return the corresponding close tag and update + * the state. If no tag is open, return an empty string. + * @return string + */ + private function closeParagraph() { + $result = ''; + if ( $this->lastSection !== '' ) { + $result = '</' . $this->lastSection . ">\n"; + } + $this->inPre = false; + $this->lastSection = ''; + return $result; + } + + /** + * getCommon() returns the length of the longest common substring + * of both arguments, starting at the beginning of both. + * + * @param string $st1 + * @param string $st2 + * + * @return int + */ + private function getCommon( $st1, $st2 ) { + $shorter = min( strlen( $st1 ), strlen( $st2 ) ); + + for ( $i = 0; $i < $shorter; ++$i ) { + if ( $st1[$i] !== $st2[$i] ) { + break; + } + } + return $i; + } + + /** + * Open the list item element identified by the prefix character. + * + * @param string $char + * + * @return string + */ + private function openList( $char ) { + $result = $this->closeParagraph(); + + if ( '*' === $char ) { + $result .= "<ul><li>"; + } elseif ( '#' === $char ) { + $result .= "<ol><li>"; + } elseif ( ':' === $char ) { + $result .= "<dl><dd>"; + } elseif ( ';' === $char ) { + $result .= "<dl><dt>"; + $this->DTopen = true; + } else { + $result = '<!-- ERR 1 -->'; + } + + return $result; + } + + /** + * Close the current list item and open the next one. 
+ * @param string $char + * + * @return string + */ + private function nextItem( $char ) { + if ( '*' === $char || '#' === $char ) { + return "</li>\n<li>"; + } elseif ( ':' === $char || ';' === $char ) { + $close = "</dd>\n"; + if ( $this->DTopen ) { + $close = "</dt>\n"; + } + if ( ';' === $char ) { + $this->DTopen = true; + return $close . '<dt>'; + } else { + $this->DTopen = false; + return $close . '<dd>'; + } + } + return '<!-- ERR 2 -->'; + } + + /** + * Close the current list item identified by the prefix character. + * @param string $char + * + * @return string + */ + private function closeList( $char ) { + if ( '*' === $char ) { + $text = "</li></ul>"; + } elseif ( '#' === $char ) { + $text = "</li></ol>"; + } elseif ( ':' === $char ) { + if ( $this->DTopen ) { + $this->DTopen = false; + $text = "</dt></dl>"; + } else { + $text = "</dd></dl>"; + } + } else { + return '<!-- ERR 3 -->'; + } + return $text; + } + + /** + * Execute the pass. + * @return string + */ + private function execute() { + $text = $this->text; + # Parsing through the text line by line. The main thing + # happening here is handling of block-level elements p, pre, + # and making lists from lines starting with * # : etc. + $textLines = StringUtils::explode( "\n", $text ); + + $lastPrefix = $output = ''; + $this->DTopen = $inBlockElem = false; + $prefixLength = 0; + $pendingPTag = false; + $inBlockquote = false; + + foreach ( $textLines as $inputLine ) { + # Fix up $lineStart + if ( !$this->lineStart ) { + $output .= $inputLine; + $this->lineStart = true; + continue; + } + # * = ul + # # = ol + # ; = dt + # : = dd + + $lastPrefixLength = strlen( $lastPrefix ); + $preCloseMatch = preg_match( '/<\\/pre/i', $inputLine ); + $preOpenMatch = preg_match( '/<pre/i', $inputLine ); + # If not in a <pre> element, scan for and figure out what prefixes are there. + if ( !$this->inPre ) { + # Multiple prefixes may abut each other for nested lists. 
+ $prefixLength = strspn( $inputLine, '*#:;' ); + $prefix = substr( $inputLine, 0, $prefixLength ); + + # eh? + # ; and : are both from definition-lists, so they're equivalent + # for the purposes of determining whether or not we need to open/close + # elements. + $prefix2 = str_replace( ';', ':', $prefix ); + $t = substr( $inputLine, $prefixLength ); + $this->inPre = (bool)$preOpenMatch; + } else { + # Don't interpret any other prefixes in preformatted text + $prefixLength = 0; + $prefix = $prefix2 = ''; + $t = $inputLine; + } + + # List generation + if ( $prefixLength && $lastPrefix === $prefix2 ) { + # Same as the last item, so no need to deal with nesting or opening stuff + $output .= $this->nextItem( substr( $prefix, -1 ) ); + $pendingPTag = false; + + if ( substr( $prefix, -1 ) === ';' ) { + # The one nasty exception: definition lists work like this: + # ; title : definition text + # So we check for : in the remainder text to split up the + # title and definition, without b0rking links. + $term = $t2 = ''; + if ( $this->findColonNoLinks( $t, $term, $t2 ) !== false ) { + $t = $t2; + // Trim whitespace in list items + $output .= trim( $term ) . $this->nextItem( ':' ); + } + } + } elseif ( $prefixLength || $lastPrefixLength ) { + # We need to open or close prefixes, or both. + + # Either open or close a level... + $commonPrefixLength = $this->getCommon( $prefix, $lastPrefix ); + $pendingPTag = false; + + # Close all the prefixes which aren't shared. + while ( $commonPrefixLength < $lastPrefixLength ) { + $output .= $this->closeList( $lastPrefix[$lastPrefixLength - 1] ); + --$lastPrefixLength; + } + + # Continue the current prefix if appropriate. 
+ if ( $prefixLength <= $commonPrefixLength && $commonPrefixLength > 0 ) { + $output .= $this->nextItem( $prefix[$commonPrefixLength - 1] ); + } + + # Close an open <dt> if we have a <dd> (":") starting on this line + if ( $this->DTopen && $commonPrefixLength > 0 && $prefix[$commonPrefixLength - 1] === ':' ) { + $output .= $this->nextItem( ':' ); + } + + # Open prefixes where appropriate. + if ( $lastPrefix && $prefixLength > $commonPrefixLength ) { + $output .= "\n"; + } + while ( $prefixLength > $commonPrefixLength ) { + $char = $prefix[$commonPrefixLength]; + $output .= $this->openList( $char ); + + if ( ';' === $char ) { + # @todo FIXME: This is dupe of code above + if ( $this->findColonNoLinks( $t, $term, $t2 ) !== false ) { + $t = $t2; + // Trim whitespace in list items + $output .= trim( $term ) . $this->nextItem( ':' ); + } + } + ++$commonPrefixLength; + } + if ( !$prefixLength && $lastPrefix ) { + $output .= "\n"; + } + $lastPrefix = $prefix2; + } + + # If we have no prefixes, go to paragraph mode. + if ( 0 == $prefixLength ) { + # No prefix (not in list)--go to paragraph mode + # @todo consider using a stack for nestable elements like span, table and div + $openMatch = preg_match( + '/(?:<table|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|' + . '<p|<ul|<ol|<dl|<li|<\\/tr|<\\/td|<\\/th)\\b/iS', + $t + ); + $closeMatch = preg_match( + '/(?:<\\/table|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|' + . '<td|<th|<\\/?blockquote|<\\/?div|<hr|<\\/pre|<\\/p|<\\/mw:|' + . Parser::MARKER_PREFIX + . 
'-pre|<\\/li|<\\/ul|<\\/ol|<\\/dl|<\\/?center)\\b/iS', + $t + ); + + if ( $openMatch || $closeMatch ) { + $pendingPTag = false; + // Only close the paragraph if we're not inside a <pre> tag, or if + // that <pre> tag has just been opened + if ( !$this->inPre || $preOpenMatch ) { + // @todo T7718: paragraph closed + $output .= $this->closeParagraph(); + } + if ( $preOpenMatch && !$preCloseMatch ) { + $this->inPre = true; + } + $bqOffset = 0; + while ( preg_match( '/<(\\/?)blockquote[\s>]/i', $t, + $bqMatch, PREG_OFFSET_CAPTURE, $bqOffset ) + ) { + $inBlockquote = !$bqMatch[1][0]; // is this a close tag? + $bqOffset = $bqMatch[0][1] + strlen( $bqMatch[0][0] ); + } + $inBlockElem = !$closeMatch; + } elseif ( !$inBlockElem && !$this->inPre ) { + if ( ' ' == substr( $t, 0, 1 ) + && ( $this->lastSection === 'pre' || trim( $t ) != '' ) + && !$inBlockquote + ) { + # pre + if ( $this->lastSection !== 'pre' ) { + $pendingPTag = false; + $output .= $this->closeParagraph() . '<pre>'; + $this->lastSection = 'pre'; + } + $t = substr( $t, 1 ); + } elseif ( preg_match( '/^(?:<style\\b[^>]*>.*?<\\/style>\s*|<link\\b[^>]*>\s*)+$/iS', $t ) ) { + # T186965: <style> or <link> by itself on a line shouldn't open or close paragraphs. + # But it should clear $pendingPTag. + if ( $pendingPTag ) { + $output .= $this->closeParagraph(); + $pendingPTag = false; + $this->lastSection = ''; + } + } else { + # paragraph + if ( trim( $t ) === '' ) { + if ( $pendingPTag ) { + $output .= $pendingPTag . '<br />'; + $pendingPTag = false; + $this->lastSection = 'p'; + } else { + if ( $this->lastSection !== 'p' ) { + $output .= $this->closeParagraph(); + $this->lastSection = ''; + $pendingPTag = '<p>'; + } else { + $pendingPTag = '</p><p>'; + } + } + } else { + if ( $pendingPTag ) { + $output .= $pendingPTag; + $pendingPTag = false; + $this->lastSection = 'p'; + } elseif ( $this->lastSection !== 'p' ) { + $output .= $this->closeParagraph() . 
'<p>'; + $this->lastSection = 'p'; + } + } + } + } + } + # somewhere above we forget to get out of pre block (T2785) + if ( $preCloseMatch && $this->inPre ) { + $this->inPre = false; + } + if ( $pendingPTag === false ) { + if ( $prefixLength === 0 ) { + $output .= $t; + $output .= "\n"; + } else { + // Trim whitespace in list items + $output .= trim( $t ); + } + } + } + while ( $prefixLength ) { + $output .= $this->closeList( $prefix2[$prefixLength - 1] ); + --$prefixLength; + if ( !$prefixLength ) { + $output .= "\n"; + } + } + if ( $this->lastSection !== '' ) { + $output .= '</' . $this->lastSection . '>'; + $this->lastSection = ''; + } + + return $output; + } + + /** + * Split up a string on ':', ignoring any occurrences inside tags + * to prevent illegal overlapping. + * + * @param string $str The string to split + * @param string &$before Set to everything before the ':' + * @param string &$after Set to everything after the ':' + * @throws MWException + * @return string The position of the ':', or false if none found + */ + private function findColonNoLinks( $str, &$before, &$after ) { + if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE ) ) { + # Nothing to find! + return false; + } + + if ( $m[0][0] === ':' ) { + # Easy; no tag nesting to worry about + $colonPos = $m[0][1]; + $before = substr( $str, 0, $colonPos ); + $after = substr( $str, $colonPos + 1 ); + return $colonPos; + } + + # Ugly state machine to walk through avoiding tags. + $state = self::COLON_STATE_TEXT; + $ltLevel = 0; + $lcLevel = 0; + $len = strlen( $str ); + for ( $i = $m[0][1]; $i < $len; $i++ ) { + $c = $str[$i]; + + switch ( $state ) { + case self::COLON_STATE_TEXT: + switch ( $c ) { + case "<": + # Could be either a <start> tag or an </end> tag + $state = self::COLON_STATE_TAGSTART; + break; + case ":": + if ( $ltLevel === 0 ) { + # We found it! + $before = substr( $str, 0, $i ); + $after = substr( $str, $i + 1 ); + return $i; + } + # Embedded in a tag; don't break it. 
+ break; + default: + # Skip ahead looking for something interesting + if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) { + # Nothing else interesting + return false; + } + if ( $m[0][0] === '-{' ) { + $state = self::COLON_STATE_LC; + $lcLevel++; + $i = $m[0][1] + 1; + } else { + # Skip ahead to next interesting character. + $i = $m[0][1] - 1; + } + break; + } + break; + case self::COLON_STATE_LC: + # In language converter markup -{ ... }- + if ( !preg_match( '/-\{|\}-/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) { + # Nothing else interesting to find; abort! + # We're nested in language converter markup, but there + # are no close tags left. Abort! + break 2; + } elseif ( $m[0][0] === '-{' ) { + $i = $m[0][1] + 1; + $lcLevel++; + } elseif ( $m[0][0] === '}-' ) { + $i = $m[0][1] + 1; + $lcLevel--; + if ( $lcLevel === 0 ) { + $state = self::COLON_STATE_TEXT; + } + } + break; + case self::COLON_STATE_TAG: + # In a <tag> + switch ( $c ) { + case ">": + $ltLevel++; + $state = self::COLON_STATE_TEXT; + break; + case "/": + # Slash may be followed by >? + $state = self::COLON_STATE_TAGSLASH; + break; + default: + # ignore + } + break; + case self::COLON_STATE_TAGSTART: + switch ( $c ) { + case "/": + $state = self::COLON_STATE_CLOSETAG; + break; + case "!": + $state = self::COLON_STATE_COMMENT; + break; + case ">": + # Illegal early close? This shouldn't happen D: + $state = self::COLON_STATE_TEXT; + break; + default: + $state = self::COLON_STATE_TAG; + } + break; + case self::COLON_STATE_CLOSETAG: + # In a </tag> + if ( $c === ">" ) { + if ( $ltLevel > 0 ) { + $ltLevel--; + } else { + # ignore the excess close tag, but keep looking for + # colons. (This matches Parsoid behavior.) + wfDebug( __METHOD__ . 
": Invalid input; too many close tags\n" ); + } + $state = self::COLON_STATE_TEXT; + } + break; + case self::COLON_STATE_TAGSLASH: + if ( $c === ">" ) { + # Yes, a self-closed tag <blah/> + $state = self::COLON_STATE_TEXT; + } else { + # Probably we're jumping the gun, and this is an attribute + $state = self::COLON_STATE_TAG; + } + break; + case self::COLON_STATE_COMMENT: + if ( $c === "-" ) { + $state = self::COLON_STATE_COMMENTDASH; + } + break; + case self::COLON_STATE_COMMENTDASH: + if ( $c === "-" ) { + $state = self::COLON_STATE_COMMENTDASHDASH; + } else { + $state = self::COLON_STATE_COMMENT; + } + break; + case self::COLON_STATE_COMMENTDASHDASH: + if ( $c === ">" ) { + $state = self::COLON_STATE_TEXT; + } else { + $state = self::COLON_STATE_COMMENT; + } + break; + default: + throw new MWException( "State machine error in " . __METHOD__ ); + } + } + if ( $ltLevel > 0 || $lcLevel > 0 ) { + wfDebug( + __METHOD__ . ": Invalid input; not enough close tags " . + "(level $ltLevel/$lcLevel, state $state)\n" + ); + return false; + } + return false; + } +} diff --git a/www/wiki/includes/parser/CacheTime.php b/www/wiki/includes/parser/CacheTime.php new file mode 100644 index 00000000..05bcebef --- /dev/null +++ b/www/wiki/includes/parser/CacheTime.php @@ -0,0 +1,175 @@ +<?php +/** + * Parser cache specific expiry check. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ + +/** + * Parser cache specific expiry check. + * + * @ingroup Parser + */ +class CacheTime { + /** @var array|bool ParserOptions which have been taken into account to + * produce output or false if not available. + */ + public $mUsedOptions; + + # Compatibility check + public $mVersion = Parser::VERSION; + + # Time when this object was generated, or -1 for uncacheable. Used in ParserCache. + public $mCacheTime = ''; + + # Seconds after which the object should expire, use 0 for uncacheable. Used in ParserCache. + public $mCacheExpiry = null; + + # Revision ID that was parsed + public $mCacheRevisionId = null; + + /** + * @return string TS_MW timestamp + */ + public function getCacheTime() { + return wfTimestamp( TS_MW, $this->mCacheTime ); + } + + /** + * setCacheTime() sets the timestamp expressing when the page has been rendered. + * This does not control expiry, see updateCacheExpiry() for that! + * @param string $t TS_MW timestamp + * @return string + */ + public function setCacheTime( $t ) { + return wfSetVar( $this->mCacheTime, $t ); + } + + /** + * @since 1.23 + * @return int|null Revision id, if any was set + */ + public function getCacheRevisionId() { + return $this->mCacheRevisionId; + } + + /** + * @since 1.23 + * @param int $id Revision id + */ + public function setCacheRevisionId( $id ) { + $this->mCacheRevisionId = $id; + } + + /** + * Sets the number of seconds after which this object should expire. + * + * This value is used with the ParserCache. + * If called with a value greater than the value provided at any previous call, + * the new call has no effect. 
The value returned by getCacheExpiry is smaller + * or equal to the smallest number that was provided as an argument to + * updateCacheExpiry(). + * + * Avoid using 0 if at all possible. Consider JavaScript for highly dynamic content. + * + * @param int $seconds + */ + public function updateCacheExpiry( $seconds ) { + $seconds = (int)$seconds; + + if ( $this->mCacheExpiry === null || $this->mCacheExpiry > $seconds ) { + $this->mCacheExpiry = $seconds; + } + } + + /** + * Returns the number of seconds after which this object should expire. + * This method is used by ParserCache to determine how long the ParserOutput can be cached. + * The timestamp of expiry can be calculated by adding getCacheExpiry() to getCacheTime(). + * The value returned by getCacheExpiry is smaller or equal to the smallest number + * that was provided to a call of updateCacheExpiry(), and smaller or equal to the + * value of $wgParserCacheExpireTime. + * @return int|mixed|null + */ + public function getCacheExpiry() { + global $wgParserCacheExpireTime; + + if ( $this->mCacheTime < 0 ) { + return 0; + } // old-style marker for "not cacheable" + + $expire = $this->mCacheExpiry; + + if ( $expire === null ) { + $expire = $wgParserCacheExpireTime; + } else { + $expire = min( $expire, $wgParserCacheExpireTime ); + } + + if ( $expire <= 0 ) { + return 0; // not cacheable + } else { + return $expire; + } + } + + /** + * @return bool + */ + public function isCacheable() { + return $this->getCacheExpiry() > 0; + } + + /** + * Return true if this cached output object predates the global or + * per-article cache invalidation timestamps, or if it comes from + * an incompatible older version. 
+ * + * @param string $touched The affected article's last touched timestamp + * @return bool + */ + public function expired( $touched ) { + global $wgCacheEpoch; + + return !$this->isCacheable() // parser says it's uncacheable + || $this->getCacheTime() < $touched + || $this->getCacheTime() <= $wgCacheEpoch + || $this->getCacheTime() < + wfTimestamp( TS_MW, time() - $this->getCacheExpiry() ) // expiry period has passed + || !isset( $this->mVersion ) + || version_compare( $this->mVersion, Parser::VERSION, "lt" ); + } + + /** + * Return true if this cached output object is for a different revision of + * the page. + * + * @todo We always return false if $this->getCacheRevisionId() is null; + * this prevents invalidating the whole parser cache when this change is + * deployed. Someday that should probably be changed. + * + * @since 1.23 + * @param int $id The affected article's current revision id + * @return bool + */ + public function isDifferentRevision( $id ) { + $cached = $this->getCacheRevisionId(); + return $cached !== null && $id !== $cached; + } +} diff --git a/www/wiki/includes/parser/CoreParserFunctions.php b/www/wiki/includes/parser/CoreParserFunctions.php new file mode 100644 index 00000000..0e30b3c8 --- /dev/null +++ b/www/wiki/includes/parser/CoreParserFunctions.php @@ -0,0 +1,1352 @@ +<?php +/** + * Parser functions provided by MediaWiki core + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ +use MediaWiki\MediaWikiServices; + +/** + * Various core parser functions, registered in Parser::firstCallInit() + * @ingroup Parser + */ +class CoreParserFunctions { + /** + * @param Parser $parser + * @return void + */ + public static function register( $parser ) { + global $wgAllowDisplayTitle, $wgAllowSlowParserFunctions; + + # Syntax for arguments (see Parser::setFunctionHook): + # "name for lookup in localized magic words array", + # function callback, + # optional Parser::SFH_NO_HASH to omit the hash from calls (e.g. {{int:...}} + # instead of {{#int:...}}) + $noHashFunctions = [ + 'ns', 'nse', 'urlencode', 'lcfirst', 'ucfirst', 'lc', 'uc', + 'localurl', 'localurle', 'fullurl', 'fullurle', 'canonicalurl', + 'canonicalurle', 'formatnum', 'grammar', 'gender', 'plural', 'bidi', + 'numberofpages', 'numberofusers', 'numberofactiveusers', + 'numberofarticles', 'numberoffiles', 'numberofadmins', + 'numberingroup', 'numberofedits', 'language', + 'padleft', 'padright', 'anchorencode', 'defaultsort', 'filepath', + 'pagesincategory', 'pagesize', 'protectionlevel', 'protectionexpiry', + 'namespacee', 'namespacenumber', 'talkspace', 'talkspacee', + 'subjectspace', 'subjectspacee', 'pagename', 'pagenamee', + 'fullpagename', 'fullpagenamee', 'rootpagename', 'rootpagenamee', + 'basepagename', 'basepagenamee', 'subpagename', 'subpagenamee', + 'talkpagename', 'talkpagenamee', 'subjectpagename', + 'subjectpagenamee', 'pageid', 'revisionid', 'revisionday', + 'revisionday2', 'revisionmonth', 'revisionmonth1', 'revisionyear', + 'revisiontimestamp', 'revisionuser', 'cascadingsources', + ]; + foreach ( $noHashFunctions as $func ) { + $parser->setFunctionHook( $func, [ __CLASS__, $func 
], Parser::SFH_NO_HASH ); + } + + $parser->setFunctionHook( + 'namespace', + [ __CLASS__, 'mwnamespace' ], + Parser::SFH_NO_HASH + ); + $parser->setFunctionHook( 'int', [ __CLASS__, 'intFunction' ], Parser::SFH_NO_HASH ); + $parser->setFunctionHook( 'special', [ __CLASS__, 'special' ] ); + $parser->setFunctionHook( 'speciale', [ __CLASS__, 'speciale' ] ); + $parser->setFunctionHook( 'tag', [ __CLASS__, 'tagObj' ], Parser::SFH_OBJECT_ARGS ); + $parser->setFunctionHook( 'formatdate', [ __CLASS__, 'formatDate' ] ); + + if ( $wgAllowDisplayTitle ) { + $parser->setFunctionHook( + 'displaytitle', + [ __CLASS__, 'displaytitle' ], + Parser::SFH_NO_HASH + ); + } + if ( $wgAllowSlowParserFunctions ) { + $parser->setFunctionHook( + 'pagesinnamespace', + [ __CLASS__, 'pagesinnamespace' ], + Parser::SFH_NO_HASH + ); + } + } + + /** + * @param Parser $parser + * @param string $part1 + * @return array + */ + public static function intFunction( $parser, $part1 = '' /*, ... */ ) { + if ( strval( $part1 ) !== '' ) { + $args = array_slice( func_get_args(), 2 ); + $message = wfMessage( $part1, $args ) + ->inLanguage( $parser->getOptions()->getUserLangObj() ); + if ( !$message->exists() ) { + // When message does not exists, the message name is surrounded by angle + // and can result in a tag, therefore escape the angles + return $message->escaped(); + } + return [ $message->plain(), 'noparse' => false ]; + } else { + return [ 'found' => false ]; + } + } + + /** + * @param Parser $parser + * @param string $date + * @param string $defaultPref + * + * @return string + */ + public static function formatDate( $parser, $date, $defaultPref = null ) { + $lang = $parser->getFunctionLang(); + $df = DateFormatter::getInstance( $lang ); + + $date = trim( $date ); + + $pref = $parser->getOptions()->getDateFormat(); + + // Specify a different default date format other than the normal default + // if the user has 'default' for their setting + if ( $pref == 'default' && $defaultPref ) { + $pref = 
$defaultPref; + } + + $date = $df->reformat( $pref, $date, [ 'match-whole' ] ); + return $date; + } + + public static function ns( $parser, $part1 = '' ) { + global $wgContLang; + if ( intval( $part1 ) || $part1 == "0" ) { + $index = intval( $part1 ); + } else { + $index = $wgContLang->getNsIndex( str_replace( ' ', '_', $part1 ) ); + } + if ( $index !== false ) { + return $wgContLang->getFormattedNsText( $index ); + } else { + return [ 'found' => false ]; + } + } + + public static function nse( $parser, $part1 = '' ) { + $ret = self::ns( $parser, $part1 ); + if ( is_string( $ret ) ) { + $ret = wfUrlencode( str_replace( ' ', '_', $ret ) ); + } + return $ret; + } + + /** + * urlencodes a string according to one of three patterns: (T24474) + * + * By default (for HTTP "query" strings), spaces are encoded as '+'. + * Or to encode a value for the HTTP "path", spaces are encoded as '%20'. + * For links to "wiki"s, or similar software, spaces are encoded as '_', + * + * @param Parser $parser + * @param string $s The text to encode. + * @param string $arg (optional): The type of encoding. + * @return string + */ + public static function urlencode( $parser, $s = '', $arg = null ) { + static $magicWords = null; + if ( is_null( $magicWords ) ) { + $magicWords = new MagicWordArray( [ 'url_path', 'url_query', 'url_wiki' ] ); + } + switch ( $magicWords->matchStartToEnd( $arg ) ) { + // Encode as though it's a wiki page, '_' for ' '. + case 'url_wiki': + $func = 'wfUrlencode'; + $s = str_replace( ' ', '_', $s ); + break; + + // Encode for an HTTP Path, '%20' for ' '. + case 'url_path': + $func = 'rawurlencode'; + break; + + // Encode for HTTP query, '+' for ' '. + case 'url_query': + default: + $func = 'urlencode'; + } + // See T105242, where the choice to kill markers and various + // other options were discussed. 
+ return $func( $parser->killMarkers( $s ) ); + } + + public static function lcfirst( $parser, $s = '' ) { + global $wgContLang; + return $wgContLang->lcfirst( $s ); + } + + public static function ucfirst( $parser, $s = '' ) { + global $wgContLang; + return $wgContLang->ucfirst( $s ); + } + + /** + * @param Parser $parser + * @param string $s + * @return string + */ + public static function lc( $parser, $s = '' ) { + global $wgContLang; + return $parser->markerSkipCallback( $s, [ $wgContLang, 'lc' ] ); + } + + /** + * @param Parser $parser + * @param string $s + * @return string + */ + public static function uc( $parser, $s = '' ) { + global $wgContLang; + return $parser->markerSkipCallback( $s, [ $wgContLang, 'uc' ] ); + } + + public static function localurl( $parser, $s = '', $arg = null ) { + return self::urlFunction( 'getLocalURL', $s, $arg ); + } + + public static function localurle( $parser, $s = '', $arg = null ) { + $temp = self::urlFunction( 'getLocalURL', $s, $arg ); + if ( !is_string( $temp ) ) { + return $temp; + } else { + return htmlspecialchars( $temp ); + } + } + + public static function fullurl( $parser, $s = '', $arg = null ) { + return self::urlFunction( 'getFullURL', $s, $arg ); + } + + public static function fullurle( $parser, $s = '', $arg = null ) { + $temp = self::urlFunction( 'getFullURL', $s, $arg ); + if ( !is_string( $temp ) ) { + return $temp; + } else { + return htmlspecialchars( $temp ); + } + } + + public static function canonicalurl( $parser, $s = '', $arg = null ) { + return self::urlFunction( 'getCanonicalURL', $s, $arg ); + } + + public static function canonicalurle( $parser, $s = '', $arg = null ) { + $temp = self::urlFunction( 'getCanonicalURL', $s, $arg ); + if ( !is_string( $temp ) ) { + return $temp; + } else { + return htmlspecialchars( $temp ); + } + } + + public static function urlFunction( $func, $s = '', $arg = null ) { + $title = Title::newFromText( $s ); + # Due to order of execution of a lot of bits, the values might 
be encoded + # before arriving here; if that's true, then the title can't be created + # and the variable will fail. If we can't get a decent title from the first + # attempt, url-decode and try for a second. + if ( is_null( $title ) ) { + $title = Title::newFromURL( urldecode( $s ) ); + } + if ( !is_null( $title ) ) { + # Convert NS_MEDIA -> NS_FILE + if ( $title->inNamespace( NS_MEDIA ) ) { + $title = Title::makeTitle( NS_FILE, $title->getDBkey() ); + } + if ( !is_null( $arg ) ) { + $text = $title->$func( $arg ); + } else { + $text = $title->$func(); + } + return $text; + } else { + return [ 'found' => false ]; + } + } + + /** + * @param Parser $parser + * @param string $num + * @param string $arg + * @return string + */ + public static function formatnum( $parser, $num = '', $arg = null ) { + if ( self::matchAgainstMagicword( 'rawsuffix', $arg ) ) { + $func = [ $parser->getFunctionLang(), 'parseFormattedNumber' ]; + } elseif ( self::matchAgainstMagicword( 'nocommafysuffix', $arg ) ) { + $func = [ $parser->getFunctionLang(), 'formatNumNoSeparators' ]; + } else { + $func = [ $parser->getFunctionLang(), 'formatNum' ]; + } + return $parser->markerSkipCallback( $num, $func ); + } + + /** + * @param Parser $parser + * @param string $case + * @param string $word + * @return string + */ + public static function grammar( $parser, $case = '', $word = '' ) { + $word = $parser->killMarkers( $word ); + return $parser->getFunctionLang()->convertGrammar( $word, $case ); + } + + /** + * @param Parser $parser + * @param string $username + * @return string + */ + public static function gender( $parser, $username ) { + $forms = array_slice( func_get_args(), 2 ); + + // Some shortcuts to avoid loading user data unnecessarily + if ( count( $forms ) === 0 ) { + return ''; + } elseif ( count( $forms ) === 1 ) { + return $forms[0]; + } + + $username = trim( $username ); + + // default + $gender = User::getDefaultOption( 'gender' ); + + // allow prefix and normalize (e.g. 
"*foo" -> "*foo" ). + $title = Title::newFromText( $username, NS_USER ); + + if ( $title && $title->inNamespace( NS_USER ) ) { + $username = $title->getText(); + } + + // check parameter, or use the ParserOptions if in interface message + $user = User::newFromName( $username ); + $genderCache = MediaWikiServices::getInstance()->getGenderCache(); + if ( $user ) { + $gender = $genderCache->getGenderOf( $user, __METHOD__ ); + } elseif ( $username === '' && $parser->getOptions()->getInterfaceMessage() ) { + $gender = $genderCache->getGenderOf( $parser->getOptions()->getUser(), __METHOD__ ); + } + $ret = $parser->getFunctionLang()->gender( $gender, $forms ); + return $ret; + } + + /** + * @param Parser $parser + * @param string $text + * @return string + */ + public static function plural( $parser, $text = '' ) { + $forms = array_slice( func_get_args(), 2 ); + $text = $parser->getFunctionLang()->parseFormattedNumber( $text ); + settype( $text, ctype_digit( $text ) ? 'int' : 'float' ); + return $parser->getFunctionLang()->convertPlural( $text, $forms ); + } + + /** + * @param Parser $parser + * @param string $text + * @return string + */ + public static function bidi( $parser, $text = '' ) { + return $parser->getFunctionLang()->embedBidi( $text ); + } + + /** + * Override the title of the page when viewed, provided we've been given a + * title which will normalise to the canonical title + * + * @param Parser $parser Parent parser + * @param string $text Desired title text + * @param string $uarg + * @return string + */ + public static function displaytitle( $parser, $text = '', $uarg = '' ) { + global $wgRestrictDisplayTitle; + + static $magicWords = null; + if ( is_null( $magicWords ) ) { + $magicWords = new MagicWordArray( [ 'displaytitle_noerror', 'displaytitle_noreplace' ] ); + } + $arg = $magicWords->matchStartToEnd( $uarg ); + + // parse a limited subset of wiki markup (just the single quote items) + $text = $parser->doQuotes( $text ); + + // remove stripped text 
(e.g. the UNIQ-QINU stuff) that was generated by tag extensions/whatever + $text = $parser->killMarkers( $text ); + + // list of disallowed tags for DISPLAYTITLE + // these will be escaped even though they are allowed in normal wiki text + $bad = [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'blockquote', 'ol', 'ul', 'li', 'hr', + 'table', 'tr', 'th', 'td', 'dl', 'dd', 'caption', 'p', 'ruby', 'rb', 'rt', 'rtc', 'rp', 'br' ]; + + // disallow some styles that could be used to bypass $wgRestrictDisplayTitle + if ( $wgRestrictDisplayTitle ) { + $htmlTagsCallback = function ( &$params ) { + $decoded = Sanitizer::decodeTagAttributes( $params ); + + if ( isset( $decoded['style'] ) ) { + // this is called later anyway, but we need it right now for the regexes below to be safe + // calling it twice doesn't hurt + $decoded['style'] = Sanitizer::checkCss( $decoded['style'] ); + + if ( preg_match( '/(display|user-select|visibility)\s*:/i', $decoded['style'] ) ) { + $decoded['style'] = '/* attempt to bypass $wgRestrictDisplayTitle */'; + } + } + + $params = Sanitizer::safeEncodeTagAttributes( $decoded ); + }; + } else { + $htmlTagsCallback = null; + } + + // only requested titles that normalize to the actual title are allowed through + // if $wgRestrictDisplayTitle is true (it is by default) + // mimic the escaping process that occurs in OutputPage::setPageTitle + $text = Sanitizer::normalizeCharReferences( Sanitizer::removeHTMLtags( + $text, + $htmlTagsCallback, + [], + [], + $bad + ) ); + $title = Title::newFromText( Sanitizer::stripAllTags( $text ) ); + + if ( !$wgRestrictDisplayTitle || + ( $title instanceof Title + && !$title->hasFragment() + && $title->equals( $parser->mTitle ) ) + ) { + $old = $parser->mOutput->getProperty( 'displaytitle' ); + if ( $old === false || $arg !== 'displaytitle_noreplace' ) { + $parser->mOutput->setDisplayTitle( $text ); + } + if ( $old !== false && $old !== $text && !$arg ) { + $converter = $parser->getConverterLanguage()->getConverter(); + 
return '<span class="error">' . + wfMessage( 'duplicate-displaytitle', + // Message should be parsed, but these params should only be escaped. + $converter->markNoConversion( wfEscapeWikiText( $old ) ), + $converter->markNoConversion( wfEscapeWikiText( $text ) ) + )->inContentLanguage()->text() . + '</span>'; + } else { + return ''; + } + } else { + $converter = $parser->getConverterLanguage()->getConverter(); + $parser->getOutput()->addWarning( + wfMessage( 'restricted-displaytitle', + // Message should be parsed, but this param should only be escaped. + $converter->markNoConversion( wfEscapeWikiText( $text ) ) + )->text() + ); + $parser->addTrackingCategory( 'restricted-displaytitle-ignored' ); + } + } + + /** + * Matches the given value against the value of given magic word + * + * @param string $magicword Magic word key + * @param string $value Value to match + * @return bool True on successful match + */ + private static function matchAgainstMagicword( $magicword, $value ) { + $value = trim( strval( $value ) ); + if ( $value === '' ) { + return false; + } + $mwObject = MagicWord::get( $magicword ); + return $mwObject->matchStartToEnd( $value ); + } + + /** + * Formats a number according to a language. 
+ * + * @param int|float $num + * @param string $raw + * @param Language|StubUserLang $language + * @return string + */ + public static function formatRaw( $num, $raw, $language ) { + if ( self::matchAgainstMagicword( 'rawsuffix', $raw ) ) { + return $num; + } else { + return $language->formatNum( $num ); + } + } + + public static function numberofpages( $parser, $raw = null ) { + return self::formatRaw( SiteStats::pages(), $raw, $parser->getFunctionLang() ); + } + + public static function numberofusers( $parser, $raw = null ) { + return self::formatRaw( SiteStats::users(), $raw, $parser->getFunctionLang() ); + } + public static function numberofactiveusers( $parser, $raw = null ) { + return self::formatRaw( SiteStats::activeUsers(), $raw, $parser->getFunctionLang() ); + } + + public static function numberofarticles( $parser, $raw = null ) { + return self::formatRaw( SiteStats::articles(), $raw, $parser->getFunctionLang() ); + } + + public static function numberoffiles( $parser, $raw = null ) { + return self::formatRaw( SiteStats::images(), $raw, $parser->getFunctionLang() ); + } + + public static function numberofadmins( $parser, $raw = null ) { + return self::formatRaw( + SiteStats::numberingroup( 'sysop' ), + $raw, + $parser->getFunctionLang() + ); + } + + public static function numberofedits( $parser, $raw = null ) { + return self::formatRaw( SiteStats::edits(), $raw, $parser->getFunctionLang() ); + } + + public static function pagesinnamespace( $parser, $namespace = 0, $raw = null ) { + return self::formatRaw( + SiteStats::pagesInNs( intval( $namespace ) ), + $raw, + $parser->getFunctionLang() + ); + } + public static function numberingroup( $parser, $name = '', $raw = null ) { + return self::formatRaw( + SiteStats::numberingroup( strtolower( $name ) ), + $raw, + $parser->getFunctionLang() + ); + } + + /** + * Given a title, return the namespace name that would be given by the + * corresponding magic word + * Note: function name changed to "mwnamespace" rather 
than "namespace" + * to not break PHP 5.3 + * @param Parser $parser + * @param string $title + * @return mixed|string + */ + public static function mwnamespace( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return str_replace( '_', ' ', $t->getNsText() ); + } + public static function namespacee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfUrlencode( $t->getNsText() ); + } + public static function namespacenumber( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return $t->getNamespace(); + } + public static function talkspace( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) || !$t->canHaveTalkPage() ) { + return ''; + } + return str_replace( '_', ' ', $t->getTalkNsText() ); + } + public static function talkspacee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) || !$t->canHaveTalkPage() ) { + return ''; + } + return wfUrlencode( $t->getTalkNsText() ); + } + public static function subjectspace( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return str_replace( '_', ' ', $t->getSubjectNsText() ); + } + public static function subjectspacee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfUrlencode( $t->getSubjectNsText() ); + } + + /** + * Functions to get and normalize pagenames, corresponding to the magic words + * of the same names + * @param Parser $parser + * @param string $title + * @return string + */ + public static function pagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfEscapeWikiText( $t->getText() ); + } + public static function pagenamee( $parser, $title = null ) { + $t = 
Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfEscapeWikiText( $t->getPartialURL() ); + } + public static function fullpagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) || !$t->canHaveTalkPage() ) { + return ''; + } + return wfEscapeWikiText( $t->getPrefixedText() ); + } + public static function fullpagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) || !$t->canHaveTalkPage() ) { + return ''; + } + return wfEscapeWikiText( $t->getPrefixedURL() ); + } + public static function subpagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfEscapeWikiText( $t->getSubpageText() ); + } + public static function subpagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfEscapeWikiText( $t->getSubpageUrlForm() ); + } + public static function rootpagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfEscapeWikiText( $t->getRootText() ); + } + public static function rootpagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfEscapeWikiText( wfUrlencode( str_replace( ' ', '_', $t->getRootText() ) ) ); + } + public static function basepagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfEscapeWikiText( $t->getBaseText() ); + } + public static function basepagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfEscapeWikiText( wfUrlencode( str_replace( ' ', '_', $t->getBaseText() ) ) ); + } + public static function talkpagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) || 
!$t->canHaveTalkPage() ) { + return ''; + } + return wfEscapeWikiText( $t->getTalkPage()->getPrefixedText() ); + } + public static function talkpagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) || !$t->canHaveTalkPage() ) { + return ''; + } + return wfEscapeWikiText( $t->getTalkPage()->getPrefixedURL() ); + } + public static function subjectpagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfEscapeWikiText( $t->getSubjectPage()->getPrefixedText() ); + } + public static function subjectpagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + return wfEscapeWikiText( $t->getSubjectPage()->getPrefixedURL() ); + } + + /** + * Return the number of pages, files or subcats in the given category, + * or 0 if it's nonexistent. This is an expensive parser function and + * can't be called too many times per page. + * @param Parser $parser + * @param string $name + * @param string $arg1 + * @param string $arg2 + * @return string + */ + public static function pagesincategory( $parser, $name = '', $arg1 = null, $arg2 = null ) { + global $wgContLang; + static $magicWords = null; + if ( is_null( $magicWords ) ) { + $magicWords = new MagicWordArray( [ + 'pagesincategory_all', + 'pagesincategory_pages', + 'pagesincategory_subcats', + 'pagesincategory_files' + ] ); + } + static $cache = []; + + // split the given option to its variable + if ( self::matchAgainstMagicword( 'rawsuffix', $arg1 ) ) { + // {{pagesincategory:|raw[|type]}} + $raw = $arg1; + $type = $magicWords->matchStartToEnd( $arg2 ); + } else { + // {{pagesincategory:[|type[|raw]]}} + $type = $magicWords->matchStartToEnd( $arg1 ); + $raw = $arg2; + } + if ( !$type ) { // backward compatibility + $type = 'pagesincategory_all'; + } + + $title = Title::makeTitleSafe( NS_CATEGORY, $name ); + if ( !$title ) { # invalid title + return 
self::formatRaw( 0, $raw, $parser->getFunctionLang() ); + } + $wgContLang->findVariantLink( $name, $title, true ); + + // Normalize name for cache + $name = $title->getDBkey(); + + if ( !isset( $cache[$name] ) ) { + $category = Category::newFromTitle( $title ); + + $allCount = $subcatCount = $fileCount = $pagesCount = 0; + if ( $parser->incrementExpensiveFunctionCount() ) { + // $allCount is the total number of cat members, + // not the count of how many members are normal pages. + $allCount = (int)$category->getPageCount(); + $subcatCount = (int)$category->getSubcatCount(); + $fileCount = (int)$category->getFileCount(); + $pagesCount = $allCount - $subcatCount - $fileCount; + } + $cache[$name]['pagesincategory_all'] = $allCount; + $cache[$name]['pagesincategory_pages'] = $pagesCount; + $cache[$name]['pagesincategory_subcats'] = $subcatCount; + $cache[$name]['pagesincategory_files'] = $fileCount; + } + + $count = $cache[$name][$type]; + return self::formatRaw( $count, $raw, $parser->getFunctionLang() ); + } + + /** + * Return the size of the given page, or 0 if it's nonexistent. This is an + * expensive parser function and can't be called too many times per page. + * + * @param Parser $parser + * @param string $page Name of page to check (Default: empty string) + * @param string $raw Should number be human readable with commas or just number + * @return string + */ + public static function pagesize( $parser, $page = '', $raw = null ) { + $title = Title::newFromText( $page ); + + if ( !is_object( $title ) ) { + return self::formatRaw( 0, $raw, $parser->getFunctionLang() ); + } + + // fetch revision from cache/database and return the value + $rev = self::getCachedRevisionObject( $parser, $title ); + $length = $rev ? 
$rev->getSize() : 0; + if ( $length === null ) { + // We've had bugs where rev_len was not being recorded for empty pages, see T135414 + $length = 0; + } + return self::formatRaw( $length, $raw, $parser->getFunctionLang() ); + } + + /** + * Returns the requested protection level for the current page. This + * is an expensive parser function and can't be called too many times + * per page, unless the protection levels/expiries for the given title + * have already been retrieved + * + * @param Parser $parser + * @param string $type + * @param string $title + * + * @return string + */ + public static function protectionlevel( $parser, $type = '', $title = '' ) { + $titleObject = Title::newFromText( $title ); + if ( !( $titleObject instanceof Title ) ) { + $titleObject = $parser->mTitle; + } + if ( $titleObject->areRestrictionsLoaded() || $parser->incrementExpensiveFunctionCount() ) { + $restrictions = $titleObject->getRestrictions( strtolower( $type ) ); + # Title::getRestrictions returns an array, its possible it may have + # multiple values in the future + return implode( ',', $restrictions ); + } + return ''; + } + + /** + * Returns the requested protection expiry for the current page. 
This + * is an expensive parser function and can't be called too many times + * per page, unless the protection levels/expiries for the given title + * have already been retrieved + * + * @param Parser $parser + * @param string $type + * @param string $title + * + * @return string + */ + public static function protectionexpiry( $parser, $type = '', $title = '' ) { + $titleObject = Title::newFromText( $title ); + if ( !( $titleObject instanceof Title ) ) { + $titleObject = $parser->mTitle; + } + if ( $titleObject->areRestrictionsLoaded() || $parser->incrementExpensiveFunctionCount() ) { + $expiry = $titleObject->getRestrictionExpiry( strtolower( $type ) ); + // getRestrictionExpiry() returns false on invalid type; trying to + // match protectionlevel() function that returns empty string instead + if ( $expiry === false ) { + $expiry = ''; + } + return $expiry; + } + return ''; + } + + /** + * Gives language names. + * @param Parser $parser + * @param string $code Language code (of which to get name) + * @param string $inLanguage Language code (in which to get name) + * @return string + */ + public static function language( $parser, $code = '', $inLanguage = '' ) { + $code = strtolower( $code ); + $inLanguage = strtolower( $inLanguage ); + $lang = Language::fetchLanguageName( $code, $inLanguage ); + return $lang !== '' ? 
$lang : LanguageCode::bcp47( $code ); + } + + /** + * Unicode-safe str_pad with the restriction that $length is forced to be <= 500 + * @param Parser $parser + * @param string $string + * @param string $length + * @param string $padding + * @param int $direction + * @return string + */ + public static function pad( + $parser, $string, $length, $padding = '0', $direction = STR_PAD_RIGHT + ) { + $padding = $parser->killMarkers( $padding ); + $lengthOfPadding = mb_strlen( $padding ); + if ( $lengthOfPadding == 0 ) { + return $string; + } + + # The remaining length to add counts down to 0 as padding is added + $length = min( (int)$length, 500 ) - mb_strlen( $string ); + if ( $length <= 0 ) { + // Nothing to add + return $string; + } + + # $finalPadding is just $padding repeated enough times so that + # mb_strlen( $string ) + mb_strlen( $finalPadding ) == $length + $finalPadding = ''; + while ( $length > 0 ) { + # If $length < $lengthofPadding, truncate $padding so we get the + # exact length desired. + $finalPadding .= mb_substr( $padding, 0, $length ); + $length -= $lengthOfPadding; + } + + if ( $direction == STR_PAD_LEFT ) { + return $finalPadding . $string; + } else { + return $string . 
$finalPadding; + } + } + + public static function padleft( $parser, $string = '', $length = 0, $padding = '0' ) { + return self::pad( $parser, $string, $length, $padding, STR_PAD_LEFT ); + } + + public static function padright( $parser, $string = '', $length = 0, $padding = '0' ) { + return self::pad( $parser, $string, $length, $padding ); + } + + /** + * @param Parser $parser + * @param string $text + * @return string + */ + public static function anchorencode( $parser, $text ) { + $text = $parser->killMarkers( $text ); + $section = (string)substr( $parser->guessSectionNameFromWikiText( $text ), 1 ); + return Sanitizer::safeEncodeAttribute( $section ); + } + + public static function special( $parser, $text ) { + list( $page, $subpage ) = SpecialPageFactory::resolveAlias( $text ); + if ( $page ) { + $title = SpecialPage::getTitleFor( $page, $subpage ); + return $title->getPrefixedText(); + } else { + // unknown special page, just use the given text as its title, if at all possible + $title = Title::makeTitleSafe( NS_SPECIAL, $text ); + return $title ? $title->getPrefixedText() : self::special( $parser, 'Badtitle' ); + } + } + + public static function speciale( $parser, $text ) { + return wfUrlencode( str_replace( ' ', '_', self::special( $parser, $text ) ) ); + } + + /** + * @param Parser $parser + * @param string $text The sortkey to use + * @param string $uarg Either "noreplace" or "noerror" (in en) + * both suppress errors, and noreplace does nothing if + * a default sortkey already exists. 
+ * @return string + */ + public static function defaultsort( $parser, $text, $uarg = '' ) { + static $magicWords = null; + if ( is_null( $magicWords ) ) { + $magicWords = new MagicWordArray( [ 'defaultsort_noerror', 'defaultsort_noreplace' ] ); + } + $arg = $magicWords->matchStartToEnd( $uarg ); + + $text = trim( $text ); + if ( strlen( $text ) == 0 ) { + return ''; + } + $old = $parser->getCustomDefaultSort(); + if ( $old === false || $arg !== 'defaultsort_noreplace' ) { + $parser->setDefaultSort( $text ); + } + + if ( $old === false || $old == $text || $arg ) { + return ''; + } else { + $converter = $parser->getConverterLanguage()->getConverter(); + return '<span class="error">' . + wfMessage( 'duplicate-defaultsort', + // Message should be parsed, but these params should only be escaped. + $converter->markNoConversion( wfEscapeWikiText( $old ) ), + $converter->markNoConversion( wfEscapeWikiText( $text ) ) + )->inContentLanguage()->text() . + '</span>'; + } + } + + /** + * Usage {{filepath|300}}, {{filepath|nowiki}}, {{filepath|nowiki|300}} + * or {{filepath|300|nowiki}} or {{filepath|300px}}, {{filepath|200x300px}}, + * {{filepath|nowiki|200x300px}}, {{filepath|200x300px|nowiki}}. + * + * @param Parser $parser + * @param string $name + * @param string $argA + * @param string $argB + * @return array|string + */ + public static function filepath( $parser, $name = '', $argA = '', $argB = '' ) { + $file = wfFindFile( $name ); + + if ( $argA == 'nowiki' ) { + // {{filepath: | option [| size] }} + $isNowiki = true; + $parsedWidthParam = Parser::parseWidthParam( $argB ); + } else { + // {{filepath: [| size [|option]] }} + $parsedWidthParam = Parser::parseWidthParam( $argA ); + $isNowiki = ( $argB == 'nowiki' ); + } + + if ( $file ) { + $url = $file->getFullUrl(); + + // If a size is requested... + if ( count( $parsedWidthParam ) ) { + $mto = $file->transform( $parsedWidthParam ); + // ... and we can + if ( $mto && !$mto->isError() ) { + // ... 
change the URL to point to a thumbnail. + $url = wfExpandUrl( $mto->getUrl(), PROTO_RELATIVE ); + } + } + if ( $isNowiki ) { + return [ $url, 'nowiki' => true ]; + } + return $url; + } else { + return ''; + } + } + + /** + * Parser function to extension tag adaptor + * @param Parser $parser + * @param PPFrame $frame + * @param PPNode[] $args + * @return string + */ + public static function tagObj( $parser, $frame, $args ) { + if ( !count( $args ) ) { + return ''; + } + $tagName = strtolower( trim( $frame->expand( array_shift( $args ) ) ) ); + + if ( count( $args ) ) { + $inner = $frame->expand( array_shift( $args ) ); + } else { + $inner = null; + } + + $attributes = []; + foreach ( $args as $arg ) { + $bits = $arg->splitArg(); + if ( strval( $bits['index'] ) === '' ) { + $name = trim( $frame->expand( $bits['name'], PPFrame::STRIP_COMMENTS ) ); + $value = trim( $frame->expand( $bits['value'] ) ); + if ( preg_match( '/^(?:["\'](.+)["\']|""|\'\')$/s', $value, $m ) ) { + $value = isset( $m[1] ) ? $m[1] : ''; + } + $attributes[$name] = $value; + } + } + + $stripList = $parser->getStripList(); + if ( !in_array( $tagName, $stripList ) ) { + // we can't handle this tag (at least not now), so just re-emit it as an ordinary tag + $attrText = ''; + foreach ( $attributes as $name => $value ) { + $attrText .= ' ' . htmlspecialchars( $name ) . '="' . htmlspecialchars( $value ) . '"'; + } + if ( $inner === null ) { + return "<$tagName$attrText/>"; + } + return "<$tagName$attrText>$inner</$tagName>"; + } + + $params = [ + 'name' => $tagName, + 'inner' => $inner, + 'attributes' => $attributes, + 'close' => "</$tagName>", + ]; + return $parser->extensionSubstitution( $params, $frame ); + } + + /** + * Fetched the current revision of the given title and return this. + * Will increment the expensive function count and + * add a template link to get the value refreshed on changes. 
+ * For a given title, which is equal to the current parser title, + * the revision object from the parser is used, when that is the current one + * + * @param Parser $parser + * @param Title $title + * @return Revision + * @since 1.23 + */ + private static function getCachedRevisionObject( $parser, $title = null ) { + if ( is_null( $title ) ) { + return null; + } + + // Use the revision from the parser itself, when param is the current page + // and the revision is the current one + if ( $title->equals( $parser->getTitle() ) ) { + $parserRev = $parser->getRevisionObject(); + if ( $parserRev && $parserRev->isCurrent() ) { + // force reparse after edit with vary-revision flag + $parser->getOutput()->setFlag( 'vary-revision' ); + wfDebug( __METHOD__ . ": use current revision from parser, setting vary-revision...\n" ); + return $parserRev; + } + } + + // Normalize name for cache + $page = $title->getPrefixedDBkey(); + + if ( !( $parser->currentRevisionCache && $parser->currentRevisionCache->has( $page ) ) + && !$parser->incrementExpensiveFunctionCount() ) { + return null; + } + $rev = $parser->fetchCurrentRevisionOfTitle( $title ); + $pageID = $rev ? $rev->getPage() : 0; + $revID = $rev ? $rev->getId() : 0; + + // Register dependency in templatelinks + $parser->getOutput()->addTemplate( $title, $pageID, $revID ); + + return $rev; + } + + /** + * Get the pageid of a specified page + * @param Parser $parser + * @param string $title Title to get the pageid from + * @return int|null|string + * @since 1.23 + */ + public static function pageid( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + // Use title from parser to have correct pageid after edit + if ( $t->equals( $parser->getTitle() ) ) { + $t = $parser->getTitle(); + return $t->getArticleID(); + } + + // These can't have ids + if ( !$t->canExist() || $t->isExternal() ) { + return 0; + } + + // Check the link cache, maybe something already looked it up. 
+ $linkCache = LinkCache::singleton(); + $pdbk = $t->getPrefixedDBkey(); + $id = $linkCache->getGoodLinkID( $pdbk ); + if ( $id != 0 ) { + $parser->mOutput->addLink( $t, $id ); + return $id; + } + if ( $linkCache->isBadLink( $pdbk ) ) { + $parser->mOutput->addLink( $t, 0 ); + return $id; + } + + // We need to load it from the DB, so mark expensive + if ( $parser->incrementExpensiveFunctionCount() ) { + $id = $t->getArticleID(); + $parser->mOutput->addLink( $t, $id ); + return $id; + } + return null; + } + + /** + * Get the id from the last revision of a specified page. + * @param Parser $parser + * @param string $title Title to get the id from + * @return int|null|string + * @since 1.23 + */ + public static function revisionid( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + // fetch revision from cache/database and return the value + $rev = self::getCachedRevisionObject( $parser, $t ); + return $rev ? $rev->getId() : ''; + } + + /** + * Get the day from the last revision of a specified page. + * @param Parser $parser + * @param string $title Title to get the day from + * @return string + * @since 1.23 + */ + public static function revisionday( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + // fetch revision from cache/database and return the value + $rev = self::getCachedRevisionObject( $parser, $t ); + return $rev ? MWTimestamp::getLocalInstance( $rev->getTimestamp() )->format( 'j' ) : ''; + } + + /** + * Get the day with leading zeros from the last revision of a specified page. 
+ * @param Parser $parser + * @param string $title Title to get the day from + * @return string + * @since 1.23 + */ + public static function revisionday2( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + // fetch revision from cache/database and return the value + $rev = self::getCachedRevisionObject( $parser, $t ); + return $rev ? MWTimestamp::getLocalInstance( $rev->getTimestamp() )->format( 'd' ) : ''; + } + + /** + * Get the month with leading zeros from the last revision of a specified page. + * @param Parser $parser + * @param string $title Title to get the month from + * @return string + * @since 1.23 + */ + public static function revisionmonth( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + // fetch revision from cache/database and return the value + $rev = self::getCachedRevisionObject( $parser, $t ); + return $rev ? MWTimestamp::getLocalInstance( $rev->getTimestamp() )->format( 'm' ) : ''; + } + + /** + * Get the month from the last revision of a specified page. + * @param Parser $parser + * @param string $title Title to get the month from + * @return string + * @since 1.23 + */ + public static function revisionmonth1( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + // fetch revision from cache/database and return the value + $rev = self::getCachedRevisionObject( $parser, $t ); + return $rev ? MWTimestamp::getLocalInstance( $rev->getTimestamp() )->format( 'n' ) : ''; + } + + /** + * Get the year from the last revision of a specified page. 
+ * @param Parser $parser + * @param string $title Title to get the year from + * @return string + * @since 1.23 + */ + public static function revisionyear( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + // fetch revision from cache/database and return the value + $rev = self::getCachedRevisionObject( $parser, $t ); + return $rev ? MWTimestamp::getLocalInstance( $rev->getTimestamp() )->format( 'Y' ) : ''; + } + + /** + * Get the timestamp from the last revision of a specified page. + * @param Parser $parser + * @param string $title Title to get the timestamp from + * @return string + * @since 1.23 + */ + public static function revisiontimestamp( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + // fetch revision from cache/database and return the value + $rev = self::getCachedRevisionObject( $parser, $t ); + return $rev ? MWTimestamp::getLocalInstance( $rev->getTimestamp() )->format( 'YmdHis' ) : ''; + } + + /** + * Get the user from the last revision of a specified page. + * @param Parser $parser + * @param string $title Title to get the user from + * @return string + * @since 1.23 + */ + public static function revisionuser( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null( $t ) ) { + return ''; + } + // fetch revision from cache/database and return the value + $rev = self::getCachedRevisionObject( $parser, $t ); + return $rev ? $rev->getUserText() : ''; + } + + /** + * Returns the sources of any cascading protection acting on a specified page. + * Pages will not return their own title unless they transclude themselves. + * This is an expensive parser function and can't be called too many times per page, + * unless cascading protection sources for the page have already been loaded. 
+ * + * @param Parser $parser + * @param string $title + * + * @return string + * @since 1.23 + */ + public static function cascadingsources( $parser, $title = '' ) { + $titleObject = Title::newFromText( $title ); + if ( !( $titleObject instanceof Title ) ) { + $titleObject = $parser->mTitle; + } + if ( $titleObject->areCascadeProtectionSourcesLoaded() + || $parser->incrementExpensiveFunctionCount() + ) { + $names = []; + $sources = $titleObject->getCascadeProtectionSources(); + foreach ( $sources[0] as $sourceTitle ) { + $names[] = $sourceTitle->getPrefixedText(); + } + return implode( '|', $names ); + } + return ''; + } + +} diff --git a/www/wiki/includes/parser/CoreTagHooks.php b/www/wiki/includes/parser/CoreTagHooks.php new file mode 100644 index 00000000..438603a8 --- /dev/null +++ b/www/wiki/includes/parser/CoreTagHooks.php @@ -0,0 +1,176 @@ +<?php +/** + * Tag hooks provided by MediaWiki core + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ + +/** + * Various tag hooks, registered in Parser::firstCallInit() + * @ingroup Parser + */ +class CoreTagHooks { + /** + * @param Parser $parser + * @return void + */ + public static function register( $parser ) { + global $wgRawHtml; + $parser->setHook( 'pre', [ __CLASS__, 'pre' ] ); + $parser->setHook( 'nowiki', [ __CLASS__, 'nowiki' ] ); + $parser->setHook( 'gallery', [ __CLASS__, 'gallery' ] ); + $parser->setHook( 'indicator', [ __CLASS__, 'indicator' ] ); + if ( $wgRawHtml ) { + $parser->setHook( 'html', [ __CLASS__, 'html' ] ); + } + } + + /** + * Core parser tag hook function for 'pre'. + * Text is treated roughly as 'nowiki' wrapped in an HTML 'pre' tag; + * valid HTML attributes are passed on. + * + * @param string $text + * @param array $attribs + * @param Parser $parser + * @return string HTML + */ + public static function pre( $text, $attribs, $parser ) { + // Backwards-compatibility hack + $content = StringUtils::delimiterReplace( '<nowiki>', '</nowiki>', '$1', $text, 'i' ); + + $attribs = Sanitizer::validateTagAttributes( $attribs, 'pre' ); + // We need to let both '"' and '&' through, + // for strip markers and entities respectively. + $content = str_replace( + [ '>', '<' ], + [ '>', '<' ], + $content + ); + return Html::rawElement( 'pre', $attribs, $content ); + } + + /** + * Core parser tag hook function for 'html', used only when + * $wgRawHtml is enabled. + * + * This is potentially unsafe and should be used only in very careful + * circumstances, as the contents are emitted as raw HTML. + * + * Uses undocumented extended tag hook return values, introduced in r61913. 
+ * + * @param string $content + * @param array $attributes + * @param Parser $parser + * @throws MWException + * @return array|string Output of tag hook + */ + public static function html( $content, $attributes, $parser ) { + global $wgRawHtml; + if ( $wgRawHtml ) { + if ( $parser->getOptions()->getAllowUnsafeRawHtml() ) { + return [ $content, 'markerType' => 'nowiki' ]; + } else { + // In a system message where raw html is + // not allowed (but it is allowed in other + // contexts). + return Html::rawElement( + 'span', + [ 'class' => 'error' ], + // Using ->text() not ->parse() as + // a paranoia measure against a loop. + wfMessage( 'rawhtml-notallowed' )->escaped() + ); + } + } else { + throw new MWException( '<html> extension tag encountered unexpectedly' ); + } + } + + /** + * Core parser tag hook function for 'nowiki'. Text within this section + * gets interpreted as a string of text with HTML-compatible character + * references, and wiki markup within it will not be expanded. + * + * Uses undocumented extended tag hook return values, introduced in r61913. + * + * @param string $content + * @param array $attributes + * @param Parser $parser + * @return array + */ + public static function nowiki( $content, $attributes, $parser ) { + $content = strtr( $content, [ + // lang converter + '-{' => '-{', + '}-' => '}-', + // html tags + '<' => '<', + '>' => '>' + // Note: Both '"' and '&' are not converted. + // This allows strip markers and entities through. + ] ); + return [ $content, 'markerType' => 'nowiki' ]; + } + + /** + * Core parser tag hook function for 'gallery'. + * + * Renders a thumbnail list of the given images, with optional captions. + * Full syntax documented on the wiki: + * + * https://www.mediawiki.org/wiki/Help:Images#Gallery_syntax + * + * @todo break Parser::renderImageGallery out here too. 
+ * + * @param string $content + * @param array $attributes + * @param Parser $parser + * @return string HTML + */ + public static function gallery( $content, $attributes, $parser ) { + return $parser->renderImageGallery( $content, $attributes ); + } + + /** + * XML-style tag for page status indicators: icons (or short text snippets) usually displayed in + * the top-right corner of the page, outside of the main content. + * + * @param string $content + * @param array $attributes + * @param Parser $parser + * @param PPFrame $frame + * @return string + * @since 1.25 + */ + public static function indicator( $content, array $attributes, Parser $parser, PPFrame $frame ) { + if ( !isset( $attributes['name'] ) || trim( $attributes['name'] ) === '' ) { + return '<span class="error">' . + wfMessage( 'invalid-indicator-name' )->inContentLanguage()->parse() . + '</span>'; + } + + $parser->getOutput()->setIndicator( + trim( $attributes['name'] ), + Parser::stripOuterParagraph( $parser->recursiveTagParseFully( $content, $frame ) ) + ); + + return ''; + } +} diff --git a/www/wiki/includes/parser/DateFormatter.php b/www/wiki/includes/parser/DateFormatter.php new file mode 100644 index 00000000..0a4a60e9 --- /dev/null +++ b/www/wiki/includes/parser/DateFormatter.php @@ -0,0 +1,391 @@ +<?php +/** + * Date formatter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ + +/** + * Date formatter, recognises dates in plain text and formats them according to user preferences. + * @todo preferences, OutputPage + * @ingroup Parser + */ +class DateFormatter { + private $mSource, $mTarget; + private $monthNames = ''; + + private $regexes; + private $rules, $xMonths, $preferences; + + private $lang, $mLinked; + + /** @var string[] */ + private $keys; + + /** @var string[] */ + private $targets; + + const ALL = -1; + const NONE = 0; + const MDY = 1; + const DMY = 2; + const YMD = 3; + const ISO1 = 4; + const LASTPREF = 4; + const ISO2 = 5; + const YDM = 6; + const DM = 7; + const MD = 8; + const LAST = 8; + + /** + * @param Language $lang In which language to format the date + */ + public function __construct( Language $lang ) { + $this->lang = $lang; + + $this->monthNames = $this->getMonthRegex(); + for ( $i = 1; $i <= 12; $i++ ) { + $this->xMonths[$this->lang->lc( $this->lang->getMonthName( $i ) )] = $i; + $this->xMonths[$this->lang->lc( $this->lang->getMonthAbbreviation( $i ) )] = $i; + } + + $this->regexTrail = '(?![a-z])/iu'; + + # Partial regular expressions + $this->prxDM = '\[\[(\d{1,2})[ _](' . $this->monthNames . ')\]\]'; + $this->prxMD = '\[\[(' . $this->monthNames . 
')[ _](\d{1,2})\]\]'; + $this->prxY = '\[\[(\d{1,4}([ _]BC|))\]\]'; + $this->prxISO1 = '\[\[(-?\d{4})]]-\[\[(\d{2})-(\d{2})\]\]'; + $this->prxISO2 = '\[\[(-?\d{4})-(\d{2})-(\d{2})\]\]'; + + # Real regular expressions + $this->regexes[self::DMY] = "/{$this->prxDM}(?: *, *| +){$this->prxY}{$this->regexTrail}"; + $this->regexes[self::YDM] = "/{$this->prxY}(?: *, *| +){$this->prxDM}{$this->regexTrail}"; + $this->regexes[self::MDY] = "/{$this->prxMD}(?: *, *| +){$this->prxY}{$this->regexTrail}"; + $this->regexes[self::YMD] = "/{$this->prxY}(?: *, *| +){$this->prxMD}{$this->regexTrail}"; + $this->regexes[self::DM] = "/{$this->prxDM}{$this->regexTrail}"; + $this->regexes[self::MD] = "/{$this->prxMD}{$this->regexTrail}"; + $this->regexes[self::ISO1] = "/{$this->prxISO1}{$this->regexTrail}"; + $this->regexes[self::ISO2] = "/{$this->prxISO2}{$this->regexTrail}"; + + # Extraction keys + # See the comments in replace() for the meaning of the letters + $this->keys[self::DMY] = 'jFY'; + $this->keys[self::YDM] = 'Y jF'; + $this->keys[self::MDY] = 'FjY'; + $this->keys[self::YMD] = 'Y Fj'; + $this->keys[self::DM] = 'jF'; + $this->keys[self::MD] = 'Fj'; + $this->keys[self::ISO1] = 'ymd'; # y means ISO year + $this->keys[self::ISO2] = 'ymd'; + + # Target date formats + $this->targets[self::DMY] = '[[F j|j F]] [[Y]]'; + $this->targets[self::YDM] = '[[Y]], [[F j|j F]]'; + $this->targets[self::MDY] = '[[F j]], [[Y]]'; + $this->targets[self::YMD] = '[[Y]] [[F j]]'; + $this->targets[self::DM] = '[[F j|j F]]'; + $this->targets[self::MD] = '[[F j]]'; + $this->targets[self::ISO1] = '[[Y|y]]-[[F j|m-d]]'; + $this->targets[self::ISO2] = '[[y-m-d]]'; + + # Rules + # pref source target + $this->rules[self::DMY][self::MD] = self::DM; + $this->rules[self::ALL][self::MD] = self::MD; + $this->rules[self::MDY][self::DM] = self::MD; + $this->rules[self::ALL][self::DM] = self::DM; + $this->rules[self::NONE][self::ISO2] = self::ISO1; + + $this->preferences = [ + 'default' => self::NONE, + 'dmy' => 
self::DMY, + 'mdy' => self::MDY, + 'ymd' => self::YMD, + 'ISO 8601' => self::ISO1, + ]; + } + + /** + * Get a DateFormatter object + * + * @param Language|null $lang In which language to format the date + * Defaults to the site content language + * @return DateFormatter + */ + public static function getInstance( $lang = null ) { + global $wgContLang, $wgMainCacheType; + + if ( is_string( $lang ) ) { + wfDeprecated( __METHOD__ . ' with type string for $lang', '1.31' ); + } + $lang = $lang ? wfGetLangObj( $lang ) : $wgContLang; + $cache = ObjectCache::getLocalServerInstance( $wgMainCacheType ); + + static $dateFormatter = false; + if ( !$dateFormatter ) { + $dateFormatter = $cache->getWithSetCallback( + $cache->makeKey( 'dateformatter', $lang->getCode() ), + $cache::TTL_HOUR, + function () use ( $lang ) { + return new DateFormatter( $lang ); + } + ); + } + + return $dateFormatter; + } + + /** + * @param string $preference User preference + * @param string $text Text to reformat + * @param array $options Array can contain 'linked' and/or 'match-whole' + * + * @return string + */ + public function reformat( $preference, $text, $options = [ 'linked' ] ) { + $linked = in_array( 'linked', $options ); + $match_whole = in_array( 'match-whole', $options ); + + if ( isset( $this->preferences[$preference] ) ) { + $preference = $this->preferences[$preference]; + } else { + $preference = self::NONE; + } + for ( $i = 1; $i <= self::LAST; $i++ ) { + $this->mSource = $i; + if ( isset( $this->rules[$preference][$i] ) ) { + # Specific rules + $this->mTarget = $this->rules[$preference][$i]; + } elseif ( isset( $this->rules[self::ALL][$i] ) ) { + # General rules + $this->mTarget = $this->rules[self::ALL][$i]; + } elseif ( $preference ) { + # User preference + $this->mTarget = $preference; + } else { + # Default + $this->mTarget = $i; + } + $regex = $this->regexes[$i]; + + // Horrible hack + if ( !$linked ) { + $regex = str_replace( [ '\[\[', '\]\]' ], '', $regex ); + } + + if ( 
$match_whole ) { + // Let's hope this works + $regex = preg_replace( '!^/!', '/^', $regex ); + $regex = str_replace( $this->regexTrail, + '$' . $this->regexTrail, $regex ); + } + + // Another horrible hack + $this->mLinked = $linked; + $text = preg_replace_callback( $regex, [ $this, 'replace' ], $text ); + unset( $this->mLinked ); + } + return $text; + } + + /** + * Regexp replacement callback + * + * @param array $matches + * @return string + */ + private function replace( $matches ) { + # Extract information from $matches + $linked = true; + if ( isset( $this->mLinked ) ) { + $linked = $this->mLinked; + } + + $bits = []; + $key = $this->keys[$this->mSource]; + $keyLength = strlen( $key ); + for ( $p = 0; $p < $keyLength; $p++ ) { + if ( $key[$p] != ' ' ) { + $bits[$key[$p]] = $matches[$p + 1]; + } + } + + return $this->formatDate( $bits, $matches[0], $linked ); + } + + /** + * @param array $bits + * @param string $orig Original input string, to be returned + * on formatting failure. + * @param bool $link + * @return string + */ + private function formatDate( $bits, $orig, $link = true ) { + $format = $this->targets[$this->mTarget]; + + if ( !$link ) { + // strip piped links + $format = preg_replace( '/\[\[[^|]+\|([^\]]+)\]\]/', '$1', $format ); + // strip remaining links + $format = str_replace( [ '[[', ']]' ], '', $format ); + } + + # Construct new date + $text = ''; + $fail = false; + + // Pre-generate y/Y stuff because we need the year for the <span> title. 
+ if ( !isset( $bits['y'] ) && isset( $bits['Y'] ) ) { + $bits['y'] = $this->makeIsoYear( $bits['Y'] ); + } + if ( !isset( $bits['Y'] ) && isset( $bits['y'] ) ) { + $bits['Y'] = $this->makeNormalYear( $bits['y'] ); + } + + if ( !isset( $bits['m'] ) ) { + $m = $this->makeIsoMonth( $bits['F'] ); + if ( !$m || $m == '00' ) { + $fail = true; + } else { + $bits['m'] = $m; + } + } + + if ( !isset( $bits['d'] ) ) { + $bits['d'] = sprintf( '%02d', $bits['j'] ); + } + + $formatLength = strlen( $format ); + for ( $p = 0; $p < $formatLength; $p++ ) { + $char = $format[$p]; + switch ( $char ) { + case 'd': # ISO day of month + $text .= $bits['d']; + break; + case 'm': # ISO month + $text .= $bits['m']; + break; + case 'y': # ISO year + $text .= $bits['y']; + break; + case 'j': # ordinary day of month + if ( !isset( $bits['j'] ) ) { + $text .= intval( $bits['d'] ); + } else { + $text .= $bits['j']; + } + break; + case 'F': # long month + if ( !isset( $bits['F'] ) ) { + $m = intval( $bits['m'] ); + if ( $m > 12 || $m < 1 ) { + $fail = true; + } else { + $text .= $this->lang->getMonthName( $m ); + } + } else { + $text .= ucfirst( $bits['F'] ); + } + break; + case 'Y': # ordinary (optional BC) year + $text .= $bits['Y']; + break; + default: + $text .= $char; + } + } + if ( $fail ) { + // This occurs when parsing a date with day or month outside the bounds + // of possibilities. + $text = $orig; + } + + $isoBits = []; + if ( isset( $bits['y'] ) ) { + $isoBits[] = $bits['y']; + } + $isoBits[] = $bits['m']; + $isoBits[] = $bits['d']; + $isoDate = implode( '-', $isoBits ); + + // Output is not strictly HTML (it's wikitext), but <span> is whitelisted. 
+ $text = Html::rawElement( 'span', + [ 'class' => 'mw-formatted-date', 'title' => $isoDate ], $text ); + + return $text; + } + + /** + * Return a regex that can be used to find month names in string + * @return string regex to find the months with + */ + private function getMonthRegex() { + $names = []; + for ( $i = 1; $i <= 12; $i++ ) { + $names[] = $this->lang->getMonthName( $i ); + $names[] = $this->lang->getMonthAbbreviation( $i ); + } + return implode( '|', $names ); + } + + /** + * Makes an ISO month, e.g. 02, from a month name + * @param string $monthName Month name + * @return string ISO month name + */ + private function makeIsoMonth( $monthName ) { + $n = $this->xMonths[$this->lang->lc( $monthName )]; + return sprintf( '%02d', $n ); + } + + /** + * Make an ISO year from a year name, for instance: '-1199' from '1200 BC' + * @param string $year Year name + * @return string ISO year name + */ + private function makeIsoYear( $year ) { + # Assumes the year is in a nice format, as enforced by the regex + if ( substr( $year, -2 ) == 'BC' ) { + $num = intval( substr( $year, 0, -3 ) ) - 1; + # PHP bug note: sprintf( "%04d", -1 ) fails poorly + $text = sprintf( '-%04d', $num ); + + } else { + $text = sprintf( '%04d', $year ); + } + return $text; + } + + /** + * Make a year one from an ISO year, for instance: '400 BC' from '-0399'. + * @param string $iso ISO year + * @return int|string int representing year number in case of AD dates, or string containing + * year number and 'BC' at the end otherwise. + */ + private function makeNormalYear( $iso ) { + if ( $iso[0] == '-' ) { + $text = ( intval( substr( $iso, 1 ) ) + 1 ) . 
' BC'; + } else { + $text = intval( $iso ); + } + return $text; + } +} diff --git a/www/wiki/includes/parser/LinkHolderArray.php b/www/wiki/includes/parser/LinkHolderArray.php new file mode 100644 index 00000000..816f7f79 --- /dev/null +++ b/www/wiki/includes/parser/LinkHolderArray.php @@ -0,0 +1,644 @@ +<?php +/** + * Holder of replacement pairs for wiki links + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ + +/** + * @ingroup Parser + */ +class LinkHolderArray { + public $internals = []; + public $interwikis = []; + public $size = 0; + + /** + * @var Parser + */ + public $parent; + protected $tempIdOffset; + + /** + * @param Parser $parent + */ + public function __construct( $parent ) { + $this->parent = $parent; + } + + /** + * Reduce memory usage to reduce the impact of circular references + */ + public function __destruct() { + foreach ( $this as $name => $value ) { + unset( $this->$name ); + } + } + + /** + * Don't serialize the parent object, it is big, and not needed when it is + * a parameter to mergeForeign(), which is the only application of + * serializing at present. + * + * Compact the titles, only serialize the text form. 
+ * @return array + */ + public function __sleep() { + foreach ( $this->internals as &$nsLinks ) { + foreach ( $nsLinks as &$entry ) { + unset( $entry['title'] ); + } + } + unset( $nsLinks ); + unset( $entry ); + + foreach ( $this->interwikis as &$entry ) { + unset( $entry['title'] ); + } + unset( $entry ); + + return [ 'internals', 'interwikis', 'size' ]; + } + + /** + * Recreate the Title objects + */ + public function __wakeup() { + foreach ( $this->internals as &$nsLinks ) { + foreach ( $nsLinks as &$entry ) { + $entry['title'] = Title::newFromText( $entry['pdbk'] ); + } + } + unset( $nsLinks ); + unset( $entry ); + + foreach ( $this->interwikis as &$entry ) { + $entry['title'] = Title::newFromText( $entry['pdbk'] ); + } + unset( $entry ); + } + + /** + * Merge another LinkHolderArray into this one + * @param LinkHolderArray $other + */ + public function merge( $other ) { + foreach ( $other->internals as $ns => $entries ) { + $this->size += count( $entries ); + if ( !isset( $this->internals[$ns] ) ) { + $this->internals[$ns] = $entries; + } else { + $this->internals[$ns] += $entries; + } + } + $this->interwikis += $other->interwikis; + } + + /** + * Merge a LinkHolderArray from another parser instance into this one. The + * keys will not be preserved. Any text which went with the old + * LinkHolderArray and needs to work with the new one should be passed in + * the $texts array. The strings in this array will have their link holders + * converted for use in the destination link holder. The resulting array of + * strings will be returned. 
+ * + * @param LinkHolderArray $other + * @param array $texts Array of strings + * @return array + */ + public function mergeForeign( $other, $texts ) { + $this->tempIdOffset = $idOffset = $this->parent->nextLinkID(); + $maxId = 0; + + # Renumber internal links + foreach ( $other->internals as $ns => $nsLinks ) { + foreach ( $nsLinks as $key => $entry ) { + $newKey = $idOffset + $key; + $this->internals[$ns][$newKey] = $entry; + $maxId = $newKey > $maxId ? $newKey : $maxId; + } + } + $texts = preg_replace_callback( '/(<!--LINK\'" \d+:)(\d+)(-->)/', + [ $this, 'mergeForeignCallback' ], $texts ); + + # Renumber interwiki links + foreach ( $other->interwikis as $key => $entry ) { + $newKey = $idOffset + $key; + $this->interwikis[$newKey] = $entry; + $maxId = $newKey > $maxId ? $newKey : $maxId; + } + $texts = preg_replace_callback( '/(<!--IWLINK\'" )(\d+)(-->)/', + [ $this, 'mergeForeignCallback' ], $texts ); + + # Set the parent link ID to be beyond the highest used ID + $this->parent->setLinkID( $maxId + 1 ); + $this->tempIdOffset = null; + return $texts; + } + + /** + * @param array $m + * @return string + */ + protected function mergeForeignCallback( $m ) { + return $m[1] . ( $m[2] + $this->tempIdOffset ) . $m[3]; + } + + /** + * Get a subset of the current LinkHolderArray which is sufficient to + * interpret the given text. 
+ * @param string $text + * @return LinkHolderArray + */ + public function getSubArray( $text ) { + $sub = new LinkHolderArray( $this->parent ); + + # Internal links + $pos = 0; + while ( $pos < strlen( $text ) ) { + if ( !preg_match( '/<!--LINK\'" (\d+):(\d+)-->/', + $text, $m, PREG_OFFSET_CAPTURE, $pos ) + ) { + break; + } + $ns = $m[1][0]; + $key = $m[2][0]; + $sub->internals[$ns][$key] = $this->internals[$ns][$key]; + $pos = $m[0][1] + strlen( $m[0][0] ); + } + + # Interwiki links + $pos = 0; + while ( $pos < strlen( $text ) ) { + if ( !preg_match( '/<!--IWLINK\'" (\d+)-->/', $text, $m, PREG_OFFSET_CAPTURE, $pos ) ) { + break; + } + $key = $m[1][0]; + $sub->interwikis[$key] = $this->interwikis[$key]; + $pos = $m[0][1] + strlen( $m[0][0] ); + } + return $sub; + } + + /** + * Returns true if the memory requirements of this object are getting large + * @return bool + */ + public function isBig() { + global $wgLinkHolderBatchSize; + return $this->size > $wgLinkHolderBatchSize; + } + + /** + * Clear all stored link holders. + * Make sure you don't have any text left using these link holders, before you call this + */ + public function clear() { + $this->internals = []; + $this->interwikis = []; + $this->size = 0; + } + + /** + * Make a link placeholder. The text returned can be later resolved to a real link with + * replaceLinkHolders(). This is done for two reasons: firstly to avoid further + * parsing of interwiki links, and secondly to allow all existence checks and + * article length checks (for stub links) to be bundled into a single query. 
+ * + * @param Title $nt + * @param string $text + * @param array $query [optional] + * @param string $trail [optional] + * @param string $prefix [optional] + * @return string + */ + public function makeHolder( $nt, $text = '', $query = [], $trail = '', $prefix = '' ) { + if ( !is_object( $nt ) ) { + # Fail gracefully + $retVal = "<!-- ERROR -->{$prefix}{$text}{$trail}"; + } else { + # Separate the link trail from the rest of the link + list( $inside, $trail ) = Linker::splitTrail( $trail ); + + $entry = [ + 'title' => $nt, + 'text' => $prefix . $text . $inside, + 'pdbk' => $nt->getPrefixedDBkey(), + ]; + if ( $query !== [] ) { + $entry['query'] = $query; + } + + if ( $nt->isExternal() ) { + // Use a globally unique ID to keep the objects mergable + $key = $this->parent->nextLinkID(); + $this->interwikis[$key] = $entry; + $retVal = "<!--IWLINK'\" $key-->{$trail}"; + } else { + $key = $this->parent->nextLinkID(); + $ns = $nt->getNamespace(); + $this->internals[$ns][$key] = $entry; + $retVal = "<!--LINK'\" $ns:$key-->{$trail}"; + } + $this->size++; + } + return $retVal; + } + + /** + * Replace <!--LINK--> link placeholders with actual links, in the buffer + * + * @param string &$text + */ + public function replace( &$text ) { + $this->replaceInternal( $text ); + $this->replaceInterwiki( $text ); + } + + /** + * Replace internal links + * @param string &$text + */ + protected function replaceInternal( &$text ) { + if ( !$this->internals ) { + return; + } + + global $wgContLang; + + $colours = []; + $linkCache = LinkCache::singleton(); + $output = $this->parent->getOutput(); + $linkRenderer = $this->parent->getLinkRenderer(); + + $dbr = wfGetDB( DB_REPLICA ); + + # Sort by namespace + ksort( $this->internals ); + + $linkcolour_ids = []; + + # Generate query + $lb = new LinkBatch(); + $lb->setCaller( __METHOD__ ); + + foreach ( $this->internals as $ns => $entries ) { + foreach ( $entries as $entry ) { + /** @var Title $title */ + $title = $entry['title']; + $pdbk = 
$entry['pdbk']; + + # Skip invalid entries. + # Result will be ugly, but prevents crash. + if ( is_null( $title ) ) { + continue; + } + + # Check if it's a static known link, e.g. interwiki + if ( $title->isAlwaysKnown() ) { + $colours[$pdbk] = ''; + } elseif ( $ns == NS_SPECIAL ) { + $colours[$pdbk] = 'new'; + } else { + $id = $linkCache->getGoodLinkID( $pdbk ); + if ( $id != 0 ) { + $colours[$pdbk] = $linkRenderer->getLinkClasses( $title ); + $output->addLink( $title, $id ); + $linkcolour_ids[$id] = $pdbk; + } elseif ( $linkCache->isBadLink( $pdbk ) ) { + $colours[$pdbk] = 'new'; + } else { + # Not in the link cache, add it to the query + $lb->addObj( $title ); + } + } + } + } + if ( !$lb->isEmpty() ) { + $fields = array_merge( + LinkCache::getSelectFields(), + [ 'page_namespace', 'page_title' ] + ); + + $res = $dbr->select( + 'page', + $fields, + $lb->constructSet( 'page', $dbr ), + __METHOD__ + ); + + # Fetch data and form into an associative array + # non-existent = broken + foreach ( $res as $s ) { + $title = Title::makeTitle( $s->page_namespace, $s->page_title ); + $pdbk = $title->getPrefixedDBkey(); + $linkCache->addGoodLinkObjFromRow( $title, $s ); + $output->addLink( $title, $s->page_id ); + $colours[$pdbk] = $linkRenderer->getLinkClasses( $title ); + // add id to the extension todolist + $linkcolour_ids[$s->page_id] = $pdbk; + } + unset( $res ); + } + if ( count( $linkcolour_ids ) ) { + // pass an array of page_ids to an extension + Hooks::run( 'GetLinkColours', [ $linkcolour_ids, &$colours ] ); + } + + # Do a second query for different language variants of links and categories + if ( $wgContLang->hasVariants() ) { + $this->doVariants( $colours ); + } + + # Construct search and replace arrays + $replacePairs = []; + foreach ( $this->internals as $ns => $entries ) { + foreach ( $entries as $index => $entry ) { + $pdbk = $entry['pdbk']; + $title = $entry['title']; + $query = isset( $entry['query'] ) ? 
$entry['query'] : []; + $key = "$ns:$index"; + $searchkey = "<!--LINK'\" $key-->"; + $displayText = $entry['text']; + if ( isset( $entry['selflink'] ) ) { + $replacePairs[$searchkey] = Linker::makeSelfLinkObj( $title, $displayText, $query ); + continue; + } + if ( $displayText === '' ) { + $displayText = null; + } else { + $displayText = new HtmlArmor( $displayText ); + } + if ( !isset( $colours[$pdbk] ) ) { + $colours[$pdbk] = 'new'; + } + $attribs = []; + if ( $colours[$pdbk] == 'new' ) { + $linkCache->addBadLinkObj( $title ); + $output->addLink( $title, 0 ); + $link = $linkRenderer->makeBrokenLink( + $title, $displayText, $attribs, $query + ); + } else { + $link = $linkRenderer->makePreloadedLink( + $title, $displayText, $colours[$pdbk], $attribs, $query + ); + } + + $replacePairs[$searchkey] = $link; + } + } + $replacer = new HashtableReplacer( $replacePairs, 1 ); + + # Do the thing + $text = preg_replace_callback( + '/(<!--LINK\'" .*?-->)/', + $replacer->cb(), + $text + ); + } + + /** + * Replace interwiki links + * @param string &$text + */ + protected function replaceInterwiki( &$text ) { + if ( empty( $this->interwikis ) ) { + return; + } + + # Make interwiki link HTML + $output = $this->parent->getOutput(); + $replacePairs = []; + $linkRenderer = $this->parent->getLinkRenderer(); + foreach ( $this->interwikis as $key => $link ) { + $replacePairs[$key] = $linkRenderer->makeLink( + $link['title'], + new HtmlArmor( $link['text'] ) + ); + $output->addInterwikiLink( $link['title'] ); + } + $replacer = new HashtableReplacer( $replacePairs, 1 ); + + $text = preg_replace_callback( + '/<!--IWLINK\'" (.*?)-->/', + $replacer->cb(), + $text ); + } + + /** + * Modify $this->internals and $colours according to language variant linking rules + * @param array &$colours + */ + protected function doVariants( &$colours ) { + global $wgContLang; + $linkBatch = new LinkBatch(); + $variantMap = []; // maps $pdbkey_Variant => $keys (of link holders) + $output = 
$this->parent->getOutput(); + $linkCache = LinkCache::singleton(); + $titlesToBeConverted = ''; + $titlesAttrs = []; + + // Concatenate titles to a single string, thus we only need auto convert the + // single string to all variants. This would improve parser's performance + // significantly. + foreach ( $this->internals as $ns => $entries ) { + if ( $ns == NS_SPECIAL ) { + continue; + } + foreach ( $entries as $index => $entry ) { + $pdbk = $entry['pdbk']; + // we only deal with new links (in its first query) + if ( !isset( $colours[$pdbk] ) || $colours[$pdbk] === 'new' ) { + $titlesAttrs[] = [ $index, $entry['title'] ]; + // separate titles with \0 because it would never appears + // in a valid title + $titlesToBeConverted .= $entry['title']->getText() . "\0"; + } + } + } + + // Now do the conversion and explode string to text of titles + $titlesAllVariants = $wgContLang->autoConvertToAllVariants( rtrim( $titlesToBeConverted, "\0" ) ); + $allVariantsName = array_keys( $titlesAllVariants ); + foreach ( $titlesAllVariants as &$titlesVariant ) { + $titlesVariant = explode( "\0", $titlesVariant ); + } + + // Then add variants of links to link batch + $parentTitle = $this->parent->getTitle(); + foreach ( $titlesAttrs as $i => $attrs ) { + /** @var Title $title */ + list( $index, $title ) = $attrs; + $ns = $title->getNamespace(); + $text = $title->getText(); + + foreach ( $allVariantsName as $variantName ) { + $textVariant = $titlesAllVariants[$variantName][$i]; + if ( $textVariant === $text ) { + continue; + } + + $variantTitle = Title::makeTitle( $ns, $textVariant ); + + // Self-link checking for mixed/different variant titles. At this point, we + // already know the exact title does not exist, so the link cannot be to a + // variant of the current title that exists as a separate page. 
+ if ( $variantTitle->equals( $parentTitle ) && !$title->hasFragment() ) { + $this->internals[$ns][$index]['selflink'] = true; + continue 2; + } + + $linkBatch->addObj( $variantTitle ); + $variantMap[$variantTitle->getPrefixedDBkey()][] = "$ns:$index"; + } + } + + // process categories, check if a category exists in some variant + $categoryMap = []; // maps $category_variant => $category (dbkeys) + $varCategories = []; // category replacements oldDBkey => newDBkey + foreach ( $output->getCategoryLinks() as $category ) { + $categoryTitle = Title::makeTitleSafe( NS_CATEGORY, $category ); + $linkBatch->addObj( $categoryTitle ); + $variants = $wgContLang->autoConvertToAllVariants( $category ); + foreach ( $variants as $variant ) { + if ( $variant !== $category ) { + $variantTitle = Title::makeTitleSafe( NS_CATEGORY, $variant ); + if ( is_null( $variantTitle ) ) { + continue; + } + $linkBatch->addObj( $variantTitle ); + $categoryMap[$variant] = [ $category, $categoryTitle ]; + } + } + } + + if ( !$linkBatch->isEmpty() ) { + // construct query + $dbr = wfGetDB( DB_REPLICA ); + $fields = array_merge( + LinkCache::getSelectFields(), + [ 'page_namespace', 'page_title' ] + ); + + $varRes = $dbr->select( 'page', + $fields, + $linkBatch->constructSet( 'page', $dbr ), + __METHOD__ + ); + + $linkcolour_ids = []; + $linkRenderer = $this->parent->getLinkRenderer(); + + // for each found variants, figure out link holders and replace + foreach ( $varRes as $s ) { + $variantTitle = Title::makeTitle( $s->page_namespace, $s->page_title ); + $varPdbk = $variantTitle->getPrefixedDBkey(); + $vardbk = $variantTitle->getDBkey(); + + $holderKeys = []; + if ( isset( $variantMap[$varPdbk] ) ) { + $holderKeys = $variantMap[$varPdbk]; + $linkCache->addGoodLinkObjFromRow( $variantTitle, $s ); + $output->addLink( $variantTitle, $s->page_id ); + } + + // loop over link holders + foreach ( $holderKeys as $key ) { + list( $ns, $index ) = explode( ':', $key, 2 ); + $entry =& 
$this->internals[$ns][$index]; + $pdbk = $entry['pdbk']; + + if ( !isset( $colours[$pdbk] ) || $colours[$pdbk] === 'new' ) { + // found link in some of the variants, replace the link holder data + $entry['title'] = $variantTitle; + $entry['pdbk'] = $varPdbk; + + // set pdbk and colour + $colours[$varPdbk] = $linkRenderer->getLinkClasses( $variantTitle ); + $linkcolour_ids[$s->page_id] = $pdbk; + } + } + + // check if the object is a variant of a category + if ( isset( $categoryMap[$vardbk] ) ) { + list( $oldkey, $oldtitle ) = $categoryMap[$vardbk]; + if ( !isset( $varCategories[$oldkey] ) && !$oldtitle->exists() ) { + $varCategories[$oldkey] = $vardbk; + } + } + } + Hooks::run( 'GetLinkColours', [ $linkcolour_ids, &$colours ] ); + + // rebuild the categories in original order (if there are replacements) + if ( count( $varCategories ) > 0 ) { + $newCats = []; + $originalCats = $output->getCategories(); + foreach ( $originalCats as $cat => $sortkey ) { + // make the replacement + if ( array_key_exists( $cat, $varCategories ) ) { + $newCats[$varCategories[$cat]] = $sortkey; + } else { + $newCats[$cat] = $sortkey; + } + } + $output->setCategoryLinks( $newCats ); + } + } + } + + /** + * Replace <!--LINK--> link placeholders with plain text of links + * (not HTML-formatted). 
+ * + * @param string $text + * @return string + */ + public function replaceText( $text ) { + $text = preg_replace_callback( + '/<!--(LINK|IWLINK)\'" (.*?)-->/', + [ $this, 'replaceTextCallback' ], + $text ); + + return $text; + } + + /** + * Callback for replaceText() + * + * @param array $matches + * @return string + * @private + */ + public function replaceTextCallback( $matches ) { + $type = $matches[1]; + $key = $matches[2]; + if ( $type == 'LINK' ) { + list( $ns, $index ) = explode( ':', $key, 2 ); + if ( isset( $this->internals[$ns][$index]['text'] ) ) { + return $this->internals[$ns][$index]['text']; + } + } elseif ( $type == 'IWLINK' ) { + if ( isset( $this->interwikis[$key]['text'] ) ) { + return $this->interwikis[$key]['text']; + } + } + return $matches[0]; + } +} diff --git a/www/wiki/includes/parser/MWTidy.php b/www/wiki/includes/parser/MWTidy.php new file mode 100644 index 00000000..19cf5731 --- /dev/null +++ b/www/wiki/includes/parser/MWTidy.php @@ -0,0 +1,145 @@ +<?php +/** + * HTML validation and correction + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Parser
 */

/**
 * Static facade over the configured HTML tidy driver.
 *
 * The driver is chosen from $wgTidyConfig when that is set, otherwise from
 * the older $wgUseTidy / $wgTidyInternal family of settings, and is then
 * cached for the rest of the request.
 *
 * @ingroup Parser
 */
class MWTidy {
	private static $instance;

	/**
	 * Run the configured tidy driver over an HTML fragment.
	 *
	 * @param string $text HTML input fragment. This should not contain a
	 *                     <body> or <html> tag.
	 * @return string Whatever the driver's tidy() returns for $text
	 * @throws MWException When no driver is available; callers are expected
	 *   to check MWTidy::isEnabled() first.
	 */
	public static function tidy( $text ) {
		$driver = self::singleton();
		if ( $driver ) {
			return $driver->tidy( $text );
		}
		throw new MWException( __METHOD__ .
			': tidy is disabled, caller should have checked MWTidy::isEnabled()' );
	}

	/**
	 * Whether a tidy driver is available.
	 *
	 * @return bool
	 */
	public static function isEnabled() {
		$driver = self::singleton();
		return $driver !== false;
	}

	/**
	 * Return the cached driver, creating it from global configuration on
	 * first use. Returns false (without caching anything) when neither
	 * $wgTidyConfig nor $wgUseTidy is set.
	 *
	 * @return bool|\MediaWiki\Tidy\TidyDriverBase
	 */
	public static function singleton() {
		global $wgUseTidy, $wgTidyInternal, $wgTidyConf, $wgDebugTidy, $wgTidyConfig,
			$wgTidyBin, $wgTidyOpts;

		if ( self::$instance !== null ) {
			return self::$instance;
		}

		if ( $wgTidyConfig !== null ) {
			$settings = $wgTidyConfig;
		} elseif ( $wgUseTidy ) {
			// b/c configuration: translate the legacy globals into a driver config
			if ( !$wgTidyInternal ) {
				$driverName = 'RaggettExternal';
			} elseif ( wfIsHHVM() ) {
				$driverName = 'RaggettInternalHHVM';
			} else {
				$driverName = 'RaggettInternalPHP';
			}
			$settings = [
				'tidyConfigFile' => $wgTidyConf,
				'debugComment' => $wgDebugTidy,
				'tidyBin' => $wgTidyBin,
				'tidyCommandLine' => $wgTidyOpts,
				'driver' => $driverName,
			];
		} else {
			return false;
		}

		self::$instance = self::factory( $settings );
		return self::$instance;
	}

	/**
	 * Build a tidy driver from a configuration array.
	 *
	 * @see $wgTidyConfig
	 * @param array $config Must contain a 'driver' key naming the driver
	 * @return bool|\MediaWiki\Tidy\TidyDriverBase False for driver 'disabled'
	 * @throws MWException When the driver name is not recognised
	 */
	public static function factory( array $config ) {
		switch ( $config['driver'] ) {
			case 'RaggettInternalHHVM':
				return new MediaWiki\Tidy\RaggettInternalHHVM( $config );
			case 'RaggettInternalPHP':
				return new MediaWiki\Tidy\RaggettInternalPHP( $config );
			case 'RaggettExternal':
				return new MediaWiki\Tidy\RaggettExternal( $config );
			case 'Html5Depurate':
				return new MediaWiki\Tidy\Html5Depurate( $config );
			case 'Html5Internal':
				return new MediaWiki\Tidy\Html5Internal( $config );
			case 'RemexHtml':
				return new MediaWiki\Tidy\RemexDriver( $config );
			case 'disabled':
				return false;
		}
		throw new MWException( "Invalid tidy driver: \"{$config['driver']}\"" );
	}

	/**
	 * Replace the cached driver. This is for testing.
	 *
	 * @param MediaWiki\Tidy\TidyDriverBase|false|null $instance
	 */
	public static function setInstance( $instance ) {
		self::$instance = $instance;
	}

	/**
	 * Forget the cached driver so the next singleton() call rebuilds it
	 */
	public static function destroySingleton() {
		self::$instance = null;
	}
}
diff --git a/www/wiki/includes/parser/Parser.php b/www/wiki/includes/parser/Parser.php
new file mode 100644
index 00000000..b66031cc
--- /dev/null
+++ b/www/wiki/includes/parser/Parser.php
@@ -0,0 +1,6181 @@
<?php
/**
 * PHP parser that converts wiki markup to HTML.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ +use MediaWiki\Linker\LinkRenderer; +use MediaWiki\MediaWikiServices; +use Wikimedia\ScopedCallback; + +/** + * @defgroup Parser Parser + */ + +/** + * PHP Parser - Processes wiki markup (which uses a more user-friendly + * syntax, such as "[[link]]" for making links), and provides a one-way + * transformation of that wiki markup it into (X)HTML output / markup + * (which in turn the browser understands, and can display). + * + * There are seven main entry points into the Parser class: + * + * - Parser::parse() + * produces HTML output + * - Parser::preSaveTransform() + * produces altered wiki markup + * - Parser::preprocess() + * removes HTML comments and expands templates + * - Parser::cleanSig() and Parser::cleanSigInSig() + * cleans a signature before saving it to preferences + * - Parser::getSection() + * return the content of a section from an article for section editing + * - Parser::replaceSection() + * replaces a section by number inside an article + * - Parser::getPreloadText() + * removes <noinclude> sections and <includeonly> tags + * + * Globals used: + * object: $wgContLang + * + * @warning $wgUser or $wgTitle or $wgRequest or $wgLang. Keep them away! 
+ * + * @par Settings: + * $wgNamespacesWithSubpages + * + * @par Settings only within ParserOptions: + * $wgAllowExternalImages + * $wgAllowSpecialInclusion + * $wgInterwikiMagic + * $wgMaxArticleSize + * + * @ingroup Parser + */ +class Parser { + /** + * Update this version number when the ParserOutput format + * changes in an incompatible way, so the parser cache + * can automatically discard old data. + */ + const VERSION = '1.6.4'; + + /** + * Update this version number when the output of serialiseHalfParsedText() + * changes in an incompatible way + */ + const HALF_PARSED_VERSION = 2; + + # Flags for Parser::setFunctionHook + const SFH_NO_HASH = 1; + const SFH_OBJECT_ARGS = 2; + + # Constants needed for external link processing + # Everything except bracket, space, or control characters + # \p{Zs} is unicode 'separator, space' category. It covers the space 0x20 + # as well as U+3000 is IDEOGRAPHIC SPACE for T21052 + # \x{FFFD} is the Unicode replacement character, which Preprocessor_DOM + # uses to replace invalid HTML characters. + const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]'; + # Simplified expression to match an IPv4 or IPv6 address, or + # at least one character of a host name (embeds EXT_LINK_URL_CLASS) + const EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}])'; + # RegExp to make image URLs (embeds IPv6 part of EXT_LINK_ADDR) + // phpcs:ignore Generic.Files.LineLength + const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]+) + \\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu'; + + # Regular expression for a non-newline space + const SPACE_NOT_NL = '(?:\t| |&\#0*160;|&\#[Xx]0*[Aa]0;|\p{Zs})'; + + # Flags for preprocessToDom + const PTD_FOR_INCLUSION = 1; + + # Allowed values for $this->mOutputType + # Parameter to startExternalParse(). 
+ const OT_HTML = 1; # like parse() + const OT_WIKI = 2; # like preSaveTransform() + const OT_PREPROCESS = 3; # like preprocess() + const OT_MSG = 3; + const OT_PLAIN = 4; # like extractSections() - portions of the original are returned unchanged. + + /** + * @var string Prefix and suffix for temporary replacement strings + * for the multipass parser. + * + * \x7f should never appear in input as it's disallowed in XML. + * Using it at the front also gives us a little extra robustness + * since it shouldn't match when butted up against identifier-like + * string constructs. + * + * Must not consist of all title characters, or else it will change + * the behavior of <nowiki> in a link. + * + * Must have a character that needs escaping in attributes, otherwise + * someone could put a strip marker in an attribute, to get around + * escaping quote marks, and break out of the attribute. Thus we add + * `'". + */ + const MARKER_SUFFIX = "-QINU`\"'\x7f"; + const MARKER_PREFIX = "\x7f'\"`UNIQ-"; + + # Markers used for wrapping the table of contents + const TOC_START = '<mw:toc>'; + const TOC_END = '</mw:toc>'; + + # Persistent: + public $mTagHooks = []; + public $mTransparentTagHooks = []; + public $mFunctionHooks = []; + public $mFunctionSynonyms = [ 0 => [], 1 => [] ]; + public $mFunctionTagHooks = []; + public $mStripList = []; + public $mDefaultStripList = []; + public $mVarCache = []; + public $mImageParams = []; + public $mImageParamsMagicArray = []; + public $mMarkerIndex = 0; + public $mFirstCall = true; + + # Initialised by initialiseVariables() + + /** + * @var MagicWordArray + */ + public $mVariables; + + /** + * @var MagicWordArray + */ + public $mSubstWords; + # Initialised in constructor + public $mConf, $mExtLinkBracketedRegex, $mUrlProtocols; + + # Initialized in getPreprocessor() + /** @var Preprocessor */ + public $mPreprocessor; + + # Cleared with clearState(): + /** + * @var ParserOutput + */ + public $mOutput; + public $mAutonumber; + + /** + * @var 
StripState + */ + public $mStripState; + + public $mIncludeCount; + /** + * @var LinkHolderArray + */ + public $mLinkHolders; + + public $mLinkID; + public $mIncludeSizes, $mPPNodeCount, $mGeneratedPPNodeCount, $mHighestExpansionDepth; + public $mDefaultSort; + public $mTplRedirCache, $mTplDomCache, $mHeadings, $mDoubleUnderscores; + public $mExpensiveFunctionCount; # number of expensive parser function calls + public $mShowToc, $mForceTocPosition; + + /** + * @var User + */ + public $mUser; # User object; only used when doing pre-save transform + + # Temporary + # These are variables reset at least once per parse regardless of $clearState + + /** + * @var ParserOptions + */ + public $mOptions; + + /** + * @var Title + */ + public $mTitle; # Title context, used for self-link rendering and similar things + public $mOutputType; # Output type, one of the OT_xxx constants + public $ot; # Shortcut alias, see setOutputType() + public $mRevisionObject; # The revision object of the specified revision ID + public $mRevisionId; # ID to display in {{REVISIONID}} tags + public $mRevisionTimestamp; # The timestamp of the specified revision ID + public $mRevisionUser; # User to display in {{REVISIONUSER}} tag + public $mRevisionSize; # Size to display in {{REVISIONSIZE}} variable + public $mRevIdForTs; # The revision ID which was used to fetch the timestamp + public $mInputSize = false; # For {{PAGESIZE}} on current page. + + /** + * @var string Deprecated accessor for the strip marker prefix. + * @deprecated since 1.26; use Parser::MARKER_PREFIX instead. + */ + public $mUniqPrefix = self::MARKER_PREFIX; + + /** + * @var array Array with the language name of each language link (i.e. the + * interwiki prefix) in the key, value arbitrary. Used to avoid sending + * duplicate language links to the ParserOutput. + */ + public $mLangLinkLanguages; + + /** + * @var MapCacheLRU|null + * @since 1.24 + * + * A cache of the current revisions of titles. 
Keys are $title->getPrefixedDbKey()
	 */
	public $currentRevisionCache;

	/**
	 * @var bool|string Recursive call protection.
	 * This variable should be treated as if it were private.
	 */
	public $mInParse = false;

	/** @var SectionProfiler */
	protected $mProfiler;

	/**
	 * @var LinkRenderer
	 */
	protected $mLinkRenderer;

	/**
	 * Store the configuration, build the bracketed external link regex and
	 * choose the preprocessor class.
	 *
	 * @param array $conf Optional settings; 'preprocessorClass' overrides the
	 *   automatic preprocessor choice.
	 */
	public function __construct( $conf = [] ) {
		$this->mConf = $conf;
		$this->mUrlProtocols = wfUrlProtocols();
		$this->mExtLinkBracketedRegex = '/\[(((?i)' . $this->mUrlProtocols . ')' .
			self::EXT_LINK_ADDR .
			self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F\\x{FFFD}]*?)\]/Su';
		if ( isset( $conf['preprocessorClass'] ) ) {
			$this->mPreprocessorClass = $conf['preprocessorClass'];
		} elseif ( defined( 'HPHP_VERSION' ) ) {
			# Preprocessor_Hash is much faster than Preprocessor_DOM under HipHop
			$this->mPreprocessorClass = Preprocessor_Hash::class;
		} elseif ( extension_loaded( 'domxml' ) ) {
			# PECL extension that conflicts with the core DOM extension (T15770)
			wfDebug( "Warning: you have the obsolete domxml extension for PHP. Please remove it!\n" );
			$this->mPreprocessorClass = Preprocessor_Hash::class;
		} elseif ( extension_loaded( 'dom' ) ) {
			$this->mPreprocessorClass = Preprocessor_DOM::class;
		} else {
			$this->mPreprocessorClass = Preprocessor_Hash::class;
		}
		wfDebug( __CLASS__ . ": using preprocessor: {$this->mPreprocessorClass}\n" );
	}

	/**
	 * Reduce memory usage to reduce the impact of circular references
	 */
	public function __destruct() {
		if ( isset( $this->mLinkHolders ) ) {
			unset( $this->mLinkHolders );
		}
		foreach ( $this as $name => $value ) {
			unset( $this->$name );
		}
	}

	/**
	 * Allow extensions to clean up when the parser is cloned
	 */
	public function __clone() {
		$this->mInParse = false;

		// T58226: When you create a reference "to" an object field, that
		// makes the object field itself be a reference too (until the other
		// reference goes out of scope). When cloning, any field that's a
		// reference is copied as a reference in the new object. Both of these
		// are defined PHP5 behaviors, as inconvenient as it is for us when old
		// hooks from PHP4 days are passing fields by reference.
		foreach ( [ 'mStripState', 'mVarCache' ] as $k ) {
			// Make a non-reference copy of the field, then rebind the field to
			// reference the new copy.
			$tmp = $this->$k;
			$this->$k =& $tmp;
			unset( $tmp );
		}

		Hooks::run( 'ParserCloned', [ $this ] );
	}

	/**
	 * Do various kinds of initialisation on the first call of the parser
	 *
	 * Registers the core parser functions and tag hooks, initialises the
	 * variables and runs the ParserFirstCallInit hook. Does nothing once
	 * $this->mFirstCall has been cleared.
	 */
	public function firstCallInit() {
		if ( !$this->mFirstCall ) {
			return;
		}
		$this->mFirstCall = false;

		CoreParserFunctions::register( $this );
		CoreTagHooks::register( $this );
		$this->initialiseVariables();

		// Avoid PHP 7.1 warning from passing $this by reference
		$parser = $this;
		Hooks::run( 'ParserFirstCallInit', [ &$parser ] );
	}

	/**
	 * Clear Parser state
	 *
	 * Resets the per-parse members (output, strip state, link holders,
	 * counters, caches) and runs the ParserClearState hook.
	 *
	 * @private
	 */
	public function clearState() {
		if ( $this->mFirstCall ) {
			$this->firstCallInit();
		}
		$this->mOutput = new ParserOutput;
		$this->mOptions->registerWatcher( [ $this->mOutput, 'recordOption' ] );
		$this->mAutonumber = 0;
		$this->mIncludeCount = [];
		$this->mLinkHolders = new LinkHolderArray( $this );
		$this->mLinkID = 0;
		$this->mRevisionObject = $this->mRevisionTimestamp =
			$this->mRevisionId = $this->mRevisionUser = $this->mRevisionSize = null;
		$this->mVarCache = [];
		$this->mUser = null;
		$this->mLangLinkLanguages = [];
		$this->currentRevisionCache = null;

		$this->mStripState = new StripState( $this );

		# Clear these on every parse, T6549
		$this->mTplRedirCache = $this->mTplDomCache = [];

		$this->mShowToc = true;
		$this->mForceTocPosition = false;
		$this->mIncludeSizes = [
			'post-expand' => 0,
			'arg' => 0,
		];
		$this->mPPNodeCount = 0;
		$this->mGeneratedPPNodeCount = 0;
		$this->mHighestExpansionDepth = 0;
		$this->mDefaultSort = false;
		$this->mHeadings = [];
		$this->mDoubleUnderscores = [];
		$this->mExpensiveFunctionCount = 0;

		# Fix cloning
		if ( isset( $this->mPreprocessor ) && $this->mPreprocessor->parser !== $this ) {
			$this->mPreprocessor = null;
		}

		$this->mProfiler = new SectionProfiler();

		// Avoid PHP 7.1 warning from passing $this by reference
		$parser = $this;
		Hooks::run( 'ParserClearState', [ &$parser ] );
	}

	/**
	 * Convert wikitext to HTML
	 * Do not call this function recursively.
	 *
	 * @param string $text Text we want to parse
	 * @param Title $title
	 * @param ParserOptions $options
	 * @param bool $linestart
	 * @param bool $clearState
	 * @param int $revid Number to pass in {{REVISIONID}}
	 * @return ParserOutput A ParserOutput
	 */
	public function parse(
		$text, Title $title, ParserOptions $options,
		$linestart = true, $clearState = true, $revid = null
	) {
		if ( $clearState ) {
			// We use U+007F DELETE to construct strip markers, so we have to make
			// sure that this character does not occur in the input text.
			$text = strtr( $text, "\x7f", "?" );
			// NOTE(review): never read again; presumably the lock is held for
			// as long as this variable stays in scope - confirm against lock().
			$magicScopeVariable = $this->lock();
		}
		// Strip U+0000 NULL (T159174)
		$text = str_replace( "\000", '', $text );

		$this->startParse( $title, $options, self::OT_HTML, $clearState );

		$this->currentRevisionCache = null;
		$this->mInputSize = strlen( $text );
		if ( $this->mOptions->getEnableLimitReport() ) {
			$this->mOutput->resetParseStartTime();
		}

		// Remember the revision members so they can be restored before returning
		$oldRevisionId = $this->mRevisionId;
		$oldRevisionObject = $this->mRevisionObject;
		$oldRevisionTimestamp = $this->mRevisionTimestamp;
		$oldRevisionUser = $this->mRevisionUser;
		$oldRevisionSize = $this->mRevisionSize;
		if ( $revid !== null ) {
			$this->mRevisionId = $revid;
			$this->mRevisionObject = null;
			$this->mRevisionTimestamp = null;
			$this->mRevisionUser = null;
			$this->mRevisionSize = null;
		}

		// Avoid PHP 7.1 warning from passing $this by reference
		$parser = $this;
		Hooks::run( 'ParserBeforeStrip', [ &$parser, &$text, &$this->mStripState ] );
		# No more strip!
		Hooks::run( 'ParserAfterStrip', [ &$parser, &$text, &$this->mStripState ] );
		$text = $this->internalParse( $text );
		Hooks::run( 'ParserAfterParse', [ &$parser, &$text, &$this->mStripState ] );

		$text = $this->internalParseHalfParsed( $text, true, $linestart );

		/**
		 * A converted title will be provided in the output object if title and
		 * content conversion are enabled, the article text does not contain
		 * a conversion-suppressing double-underscore tag, and no
		 * {{DISPLAYTITLE:...}} is present. DISPLAYTITLE takes precedence over
		 * automatic link conversion.
		 */
		if ( !( $options->getDisableTitleConversion()
			|| isset( $this->mDoubleUnderscores['nocontentconvert'] )
			|| isset( $this->mDoubleUnderscores['notitleconvert'] )
			|| $this->mOutput->getDisplayTitle() !== false )
		) {
			$convruletitle = $this->getConverterLanguage()->getConvRuleTitle();
			if ( $convruletitle ) {
				$this->mOutput->setTitleText( $convruletitle );
			} else {
				$titleText = $this->getConverterLanguage()->convertTitle( $title );
				$this->mOutput->setTitleText( $titleText );
			}
		}

		# Compute runtime adaptive expiry if set
		$this->mOutput->finalizeAdaptiveCacheExpiry();

		# Warn if too many heavyweight parser functions were used
		if ( $this->mExpensiveFunctionCount > $this->mOptions->getExpensiveParserFunctionLimit() ) {
			$this->limitationWarn( 'expensive-parserfunction',
				$this->mExpensiveFunctionCount,
				$this->mOptions->getExpensiveParserFunctionLimit()
			);
		}

		# Information on limits, for the benefit of users who try to skirt them
		if ( $this->mOptions->getEnableLimitReport() ) {
			$text .= $this->makeLimitReport();
		}

		# Wrap non-interface parser output in a <div> so it can be targeted
		# with CSS (T37247)
		$class = $this->mOptions->getWrapOutputClass();
		if ( $class !== false && !$this->mOptions->getInterfaceMessage() ) {
			$text = Html::rawElement( 'div', [ 'class' => $class ], $text );
		}

		$this->mOutput->setText( $text );

		$this->mRevisionId = $oldRevisionId;
		$this->mRevisionObject = $oldRevisionObject;
		$this->mRevisionTimestamp = $oldRevisionTimestamp;
		$this->mRevisionUser = $oldRevisionUser;
		$this->mRevisionSize = $oldRevisionSize;
		$this->mInputSize = false;
		$this->currentRevisionCache = null;

		return $this->mOutput;
	}

	/**
	 * Set the limit report data in the current ParserOutput, and return the
	 * limit report HTML comment.
	 *
	 * The returned string holds two HTML comments: the "NewPP limit report"
	 * and a report of the ten profiled functions with the highest 'real'
	 * value, largest first.
	 *
	 * @return string
	 */
	protected function makeLimitReport() {
		global $wgShowHostnames;

		$maxIncludeSize = $this->mOptions->getMaxIncludeSize();

		$cpuTime = $this->mOutput->getTimeSinceStart( 'cpu' );
		if ( $cpuTime !== null ) {
			$this->mOutput->setLimitReportData( 'limitreport-cputime',
				sprintf( "%.3f", $cpuTime )
			);
		}

		$wallTime = $this->mOutput->getTimeSinceStart( 'wall' );
		$this->mOutput->setLimitReportData( 'limitreport-walltime',
			sprintf( "%.3f", $wallTime )
		);

		$this->mOutput->setLimitReportData( 'limitreport-ppvisitednodes',
			[ $this->mPPNodeCount, $this->mOptions->getMaxPPNodeCount() ]
		);
		$this->mOutput->setLimitReportData( 'limitreport-ppgeneratednodes',
			[ $this->mGeneratedPPNodeCount, $this->mOptions->getMaxGeneratedPPNodeCount() ]
		);
		$this->mOutput->setLimitReportData( 'limitreport-postexpandincludesize',
			[ $this->mIncludeSizes['post-expand'], $maxIncludeSize ]
		);
		$this->mOutput->setLimitReportData( 'limitreport-templateargumentsize',
			[ $this->mIncludeSizes['arg'], $maxIncludeSize ]
		);
		$this->mOutput->setLimitReportData( 'limitreport-expansiondepth',
			[ $this->mHighestExpansionDepth, $this->mOptions->getMaxPPExpandDepth() ]
		);
		$this->mOutput->setLimitReportData( 'limitreport-expensivefunctioncount',
			[ $this->mExpensiveFunctionCount, $this->mOptions->getExpensiveParserFunctionLimit() ]
		);

		foreach ( $this->mStripState->getLimitReport() as list( $key, $value ) ) {
			$this->mOutput->setLimitReportData( $key, $value );
		}

		Hooks::run( 'ParserLimitReportPrepare', [ $this, $this->mOutput ] );

		$limitReport = "NewPP limit report\n";
		if ( $wgShowHostnames ) {
			$limitReport .= 'Parsed by ' . wfHostname() . "\n";
		}
		$limitReport .= 'Cached time: ' . $this->mOutput->getCacheTime() . "\n";
		$limitReport .= 'Cache expiry: ' . $this->mOutput->getCacheExpiry() . "\n";
		$limitReport .= 'Dynamic content: ' .
			( $this->mOutput->hasDynamicContent() ? 'true' : 'false' ) .
			"\n";

		foreach ( $this->mOutput->getLimitReportData() as $key => $value ) {
			if ( Hooks::run( 'ParserLimitReportFormat',
				[ $key, &$value, &$limitReport, false, false ]
			) ) {
				$keyMsg = wfMessage( $key )->inLanguage( 'en' )->useDatabase( false );
				$valueMsg = wfMessage( [ "$key-value-text", "$key-value" ] )
					->inLanguage( 'en' )->useDatabase( false );
				if ( !$valueMsg->exists() ) {
					$valueMsg = new RawMessage( '$1' );
				}
				if ( !$keyMsg->isDisabled() && !$valueMsg->isDisabled() ) {
					$valueMsg->params( $value );
					$limitReport .= "{$keyMsg->text()}: {$valueMsg->text()}\n";
				}
			}
		}
		// Since we're not really outputting HTML, decode the entities and
		// then re-encode the things that need hiding inside HTML comments.
		$limitReport = htmlspecialchars_decode( $limitReport );
		// Run deprecated hook
		Hooks::run( 'ParserLimitReport', [ $this, &$limitReport ], '1.22' );

		// Sanitize for comment. Note '‐' in the replacement is U+2010,
		// which looks much like the problematic '-'. The ampersand is
		// re-encoded as an entity; replacing '&' with itself would make the
		// second pair a no-op.
		$limitReport = str_replace( [ '-', '&' ], [ '‐', '&amp;' ], $limitReport );
		$text = "\n<!-- \n$limitReport-->\n";

		// Add on template profiling data in human/machine readable way
		$dataByFunc = $this->mProfiler->getFunctionStats();
		// uasort() needs an integer result (<0, 0, >0); a boolean "less than"
		// can never report "greater", so compare with the spaceship operator.
		uasort( $dataByFunc, function ( $a, $b ) {
			return $b['real'] <=> $a['real']; // descending order
		} );
		$profileReport = [];
		foreach ( array_slice( $dataByFunc, 0, 10 ) as $item ) {
			$profileReport[] = sprintf( "%6.2f%% %8.3f %6d %s",
				$item['%real'], $item['real'], $item['calls'],
				htmlspecialchars( $item['name'] ) );
		}
		$text .= "<!--\nTransclusion expansion time report (%,ms,calls,template)\n";
		$text .= implode( "\n", $profileReport ) . "\n-->\n";

		$this->mOutput->setLimitReportData( 'limitreport-timingprofile', $profileReport );

		// Add other cache related metadata
		if ( $wgShowHostnames ) {
			$this->mOutput->setLimitReportData( 'cachereport-origin', wfHostname() );
		}
		$this->mOutput->setLimitReportData( 'cachereport-timestamp',
			$this->mOutput->getCacheTime() );
		$this->mOutput->setLimitReportData( 'cachereport-ttl',
			$this->mOutput->getCacheExpiry() );
		$this->mOutput->setLimitReportData( 'cachereport-transientcontent',
			$this->mOutput->hasDynamicContent() );

		// Log pages whose generated node count exceeds a tenth of the limit
		if ( $this->mGeneratedPPNodeCount > $this->mOptions->getMaxGeneratedPPNodeCount() / 10 ) {
			wfDebugLog( 'generated-pp-node-count', $this->mGeneratedPPNodeCount . ' ' .
				$this->mTitle->getPrefixedDBkey() );
		}
		return $text;
	}

	/**
	 * Half-parse wikitext to half-parsed HTML. This recursive parser entry point
	 * can be called from an extension tag hook.
	 *
	 * The output of this function IS NOT SAFE PARSED HTML; it is "half-parsed"
	 * instead, which means that lists and links have not been fully parsed yet,
	 * and strip markers are still present.
	 *
	 * Use recursiveTagParseFully() to fully parse wikitext to output-safe HTML.
*
	 * This is the entry point for a parser tag hook that wants wikitext
	 * parsed before or after its own transformations and that will *return
	 * the result as hook output*, so that the remaining parsing stages are
	 * applied to it automatically.
	 *
	 * Without a $frame, template variables (e.g., {{{1}}}) in $text are
	 * left unexpanded.
	 *
	 * @param string $text Text extension wants to have parsed
	 * @param bool|PPFrame $frame The frame to use for expanding any template variables
	 * @return string UNSAFE half-parsed HTML
	 */
	public function recursiveTagParse( $text, $frame = false ) {
		// Hooks receive the parser by reference; use a copy of the handle to
		// avoid the PHP 7.1 warning about passing $this by reference.
		$self = $this;
		foreach ( [ 'ParserBeforeStrip', 'ParserAfterStrip' ] as $hookName ) {
			Hooks::run( $hookName, [ &$self, &$text, &$this->mStripState ] );
		}
		return $this->internalParse( $text, false, $frame );
	}

	/**
	 * Parse wikitext all the way to output-safe HTML. Like
	 * recursiveTagParse(), this recursive entry point may be called from an
	 * extension tag hook.
	 *
	 * The half-parsed result of recursiveTagParse() is passed through
	 * internalParseHalfParsed(). A parser tag hook that returns its result
	 * as hook output will usually want recursiveTagParse() instead.
	 *
	 * Without a $frame, template variables (e.g., {{{1}}}) in $text are
	 * left unexpanded.
	 *
	 * @since 1.25
	 *
	 * @param string $text Text extension wants to have parsed
	 * @param bool|PPFrame $frame The frame to use for expanding any template variables
	 * @return string Fully parsed HTML
	 */
	public function recursiveTagParseFully( $text, $frame = false ) {
		$halfParsed = $this->recursiveTagParse( $text, $frame );
		return $this->internalParseHalfParsed( $halfParsed, false );
	}

	/**
	 * Expand templates and variables in the text, producing valid, static wikitext.
	 * Also removes comments.
	 * Do not call this function recursively.
* @param string $text
	 * @param Title $title
	 * @param ParserOptions $options
	 * @param int|null $revid When not null, stored as the current revision ID
	 * @param bool|PPFrame $frame
	 * @return mixed|string
	 */
	public function preprocess( $text, Title $title = null,
		ParserOptions $options, $revid = null, $frame = false
	) {
		// Kept in a local so the value returned by lock() lives until return
		$lockScope = $this->lock();
		$this->startParse( $title, $options, self::OT_PREPROCESS, true );
		if ( $revid !== null ) {
			$this->mRevisionId = $revid;
		}

		// Hooks receive the parser by reference; use a copy of the handle to
		// avoid the PHP 7.1 warning about passing $this by reference.
		$self = $this;
		foreach ( [ 'ParserBeforeStrip', 'ParserAfterStrip' ] as $hookName ) {
			Hooks::run( $hookName, [ &$self, &$text, &$this->mStripState ] );
		}

		$expanded = $this->replaceVariables( $text, $frame );
		return $this->mStripState->unstripBoth( $expanded );
	}

	/**
	 * Expand variables in $text and unstrip the result. This recursive
	 * entry point can be called from an extension tag hook.
	 *
	 * @param string $text Text to be expanded
	 * @param bool|PPFrame $frame The frame to use for expanding any template variables
	 * @return string
	 * @since 1.19
	 */
	public function recursivePreprocess( $text, $frame = false ) {
		$expanded = $this->replaceVariables( $text, $frame );
		return $this->mStripState->unstripBoth( $expanded );
	}

	/**
	 * Process the wikitext for the "?preload=" feature. (T7210)
	 *
	 * "<noinclude>", "<includeonly>" etc. are parsed as for template
	 * transclusion, comments, templates, arguments, tags hooks and parser
	 * functions are untouched.
*
	 * @param string $text
	 * @param Title $title
	 * @param ParserOptions $options
	 * @param array $params Parameters substituted into $text via RawMessage
	 * @return string
	 */
	public function getPreloadText( $text, Title $title, ParserOptions $options, $params = [] ) {
		$rawMessage = new RawMessage( $text );
		$text = $rawMessage->params( $params )->plain();

		# Parser (re)initialisation
		// Kept in a local so the value returned by lock() lives until return
		$lockScope = $this->lock();
		$this->startParse( $title, $options, self::OT_PLAIN, true );

		$dom = $this->preprocessToDom( $text, self::PTD_FOR_INCLUSION );
		$expanded = $this->getPreprocessor()->newFrame()->expand(
			$dom,
			PPFrame::NO_ARGS | PPFrame::NO_TEMPLATES
		);
		return $this->mStripState->unstripBoth( $expanded );
	}

	/**
	 * Store the user to act as.
	 * Should only be used when doing pre-save transform.
	 *
	 * @param User|null $user User object or null (to reset)
	 */
	public function setUser( $user ) {
		$this->mUser = $user;
	}

	/**
	 * Store the context title. A falsy $t is replaced by the title
	 * 'NO TITLE', and any fragment is removed before storing.
	 *
	 * @param Title $t
	 */
	public function setTitle( $t ) {
		if ( !$t ) {
			$t = Title::newFromText( 'NO TITLE' );
		}

		# Strip the fragment to avoid various odd effects
		$this->mTitle = $t->hasFragment()
			? $t->createFragmentTarget( '' )
			: $t;
	}

	/**
	 * Return the context title
	 *
	 * @return Title
	 */
	public function getTitle() {
		return $this->mTitle;
	}

	/**
	 * Combined getter/setter for the context title, via wfSetVar()
	 *
	 * @param Title $x Title object or null to just get the current one
	 * @return Title
	 */
	public function Title( $x = null ) {
		return wfSetVar( $this->mTitle, $x );
	}

	/**
	 * Store the output type and rebuild the $this->ot flag array
	 *
	 * @param int $ot New value, one of the OT_xxx constants
	 */
	public function setOutputType( $ot ) {
		$this->mOutputType = $ot;

		# Shortcut alias: one boolean per output mode
		$flags = [];
		$flags['html'] = ( $ot == self::OT_HTML );
		$flags['wiki'] = ( $ot == self::OT_WIKI );
		$flags['pre'] = ( $ot == self::OT_PREPROCESS );
		$flags['plain'] = ( $ot == self::OT_PLAIN );
		$this->ot = $flags;
	}

	/**
	 * Accessor/mutator for the output type
	 *
* @param int|null $x New value or null to just get the current one
	 * @return int
	 */
	public function OutputType( $x = null ) {
		return wfSetVar( $this->mOutputType, $x );
	}

	/**
	 * Return the current ParserOutput
	 *
	 * @return ParserOutput
	 */
	public function getOutput() {
		return $this->mOutput;
	}

	/**
	 * Return the current ParserOptions
	 *
	 * @return ParserOptions
	 */
	public function getOptions() {
		return $this->mOptions;
	}

	/**
	 * Combined getter/setter for the ParserOptions object, via wfSetVar()
	 *
	 * @param ParserOptions $x New value or null to just get the current one
	 * @return ParserOptions Current ParserOptions object
	 */
	public function Options( $x = null ) {
		return wfSetVar( $this->mOptions, $x );
	}

	/**
	 * Return the current link ID and advance the counter by one
	 *
	 * @return int
	 */
	public function nextLinkID() {
		$current = $this->mLinkID;
		$this->mLinkID++;
		return $current;
	}

	/**
	 * Overwrite the link ID counter
	 *
	 * @param int $id
	 */
	public function setLinkID( $id ) {
		$this->mLinkID = $id;
	}

	/**
	 * Get a language object for use in parser functions such as {{FORMATNUM:}}
	 * Currently the same object as getTargetLanguage().
	 *
	 * @return Language
	 */
	public function getFunctionLang() {
		return $this->getTargetLanguage();
	}

	/**
	 * Get the target language for the content being parsed. This is usually the
	 * language that the content is in.
	 *
	 * Resolution order: the target language from the options, then the user
	 * language for interface messages, then the page language of the title.
	 *
	 * @since 1.19
	 *
	 * @throws MWException When none of the above applies and no title is set
	 * @return Language
	 */
	public function getTargetLanguage() {
		$explicit = $this->mOptions->getTargetLanguage();
		if ( $explicit !== null ) {
			return $explicit;
		}

		if ( $this->mOptions->getInterfaceMessage() ) {
			return $this->mOptions->getUserLangObj();
		}

		if ( $this->mTitle === null ) {
			throw new MWException( __METHOD__ . ': $this->mTitle is null' );
		}

		return $this->mTitle->getPageLanguage();
	}

	/**
	 * Get the language object for language conversion
	 * Currently the same object as getTargetLanguage().
	 *
	 * @return Language|null
	 */
	public function getConverterLanguage() {
		return $this->getTargetLanguage();
	}

	/**
	 * Return $this->mUser when it is set, and the user of the ParserOptions
	 * object otherwise
	 *
	 * @return User
	 */
	public function getUser() {
		if ( $this->mUser === null ) {
			return $this->mOptions->getUser();
		}
		return $this->mUser;
	}

	/**
	 * Return the preprocessor, instantiating $this->mPreprocessorClass on
	 * first use
	 *
	 * @return Preprocessor
	 */
	public function getPreprocessor() {
		if ( isset( $this->mPreprocessor ) ) {
			return $this->mPreprocessor;
		}
		$preprocessorClass = $this->mPreprocessorClass;
		$this->mPreprocessor = new $preprocessorClass( $this );
		return $this->mPreprocessor;
	}

	/**
	 * Return the LinkRenderer used to make links, creating it on first use
	 * and giving it the stub threshold from the parser options
	 *
	 * @since 1.28
	 * @return LinkRenderer
	 */
	public function getLinkRenderer() {
		if ( $this->mLinkRenderer ) {
			return $this->mLinkRenderer;
		}

		$factory = MediaWikiServices::getInstance()->getLinkRendererFactory();
		$this->mLinkRenderer = $factory->create();
		$stubThreshold = $this->getOptions()->getStubThreshold();
		$this->mLinkRenderer->setStubThreshold( $stubThreshold );

		return $this->mLinkRenderer;
	}

	/**
	 * Replaces all occurrences of HTML-style comments and the given tags
	 * in the text with a random marker and returns the next text. The output
	 * parameter $matches will be an associative array filled with data in
	 * the form:
	 *
	 * @code
	 *   'UNIQ-xxxxx' => [
	 *     'element',
	 *     'tag content',
	 *     [ 'param' => 'x' ],
	 *     '<element param="x">tag content</element>' ]
	 * @endcode
	 *
	 * @param array $elements List of element names. Comments are always extracted.
	 * @param string $text Source text string.
+ * @param array &$matches Out parameter, Array: extracted tags + * @return string Stripped text + */ + public static function extractTagsAndParams( $elements, $text, &$matches ) { + static $n = 1; + $stripped = ''; + $matches = []; + + $taglist = implode( '|', $elements ); + $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?" . ">)|<(!--)/i"; + + while ( $text != '' ) { + $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE ); + $stripped .= $p[0]; + if ( count( $p ) < 5 ) { + break; + } + if ( count( $p ) > 5 ) { + # comment + $element = $p[4]; + $attributes = ''; + $close = ''; + $inside = $p[5]; + } else { + # tag + $element = $p[1]; + $attributes = $p[2]; + $close = $p[3]; + $inside = $p[4]; + } + + $marker = self::MARKER_PREFIX . "-$element-" . sprintf( '%08X', $n++ ) . self::MARKER_SUFFIX; + $stripped .= $marker; + + if ( $close === '/>' ) { + # Empty element tag, <tag /> + $content = null; + $text = $inside; + $tail = null; + } else { + if ( $element === '!--' ) { + $end = '/(-->)/'; + } else { + $end = "/(<\\/$element\\s*>)/i"; + } + $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE ); + $content = $q[0]; + if ( count( $q ) < 3 ) { + # No end tag -- let it run out to the end of the text. + $tail = ''; + $text = ''; + } else { + $tail = $q[1]; + $text = $q[2]; + } + } + + $matches[$marker] = [ $element, + $content, + Sanitizer::decodeTagAttributes( $attributes ), + "<$element$attributes$close$content$tail" ]; + } + return $stripped; + } + + /** + * Get a list of strippable XML-like elements + * + * @return array + */ + public function getStripList() { + return $this->mStripList; + } + + /** + * Add an item to the strip state + * Returns the unique tag which must be inserted into the stripped text + * The tag will be replaced with the original text in unstrip() + * + * @param string $text + * + * @return string + */ + public function insertStripItem( $text ) { + $marker = self::MARKER_PREFIX . "-item-{$this->mMarkerIndex}-" . 
self::MARKER_SUFFIX; + $this->mMarkerIndex++; + $this->mStripState->addGeneral( $marker, $text ); + return $marker; + } + + /** + * parse the wiki syntax used to render tables + * + * @private + * @param string $text + * @return string + */ + public function doTableStuff( $text ) { + $lines = StringUtils::explode( "\n", $text ); + $out = ''; + $td_history = []; # Is currently a td tag open? + $last_tag_history = []; # Save history of last lag activated (td, th or caption) + $tr_history = []; # Is currently a tr tag open? + $tr_attributes = []; # history of tr attributes + $has_opened_tr = []; # Did this table open a <tr> element? + $indent_level = 0; # indent level of the table + + foreach ( $lines as $outLine ) { + $line = trim( $outLine ); + + if ( $line === '' ) { # empty line, go to next line + $out .= $outLine . "\n"; + continue; + } + + $first_character = $line[0]; + $first_two = substr( $line, 0, 2 ); + $matches = []; + + if ( preg_match( '/^(:*)\s*\{\|(.*)$/', $line, $matches ) ) { + # First check if we are starting a new table + $indent_level = strlen( $matches[1] ); + + $attributes = $this->mStripState->unstripBoth( $matches[2] ); + $attributes = Sanitizer::fixTagAttributes( $attributes, 'table' ); + + $outLine = str_repeat( '<dl><dd>', $indent_level ) . "<table{$attributes}>"; + array_push( $td_history, false ); + array_push( $last_tag_history, '' ); + array_push( $tr_history, false ); + array_push( $tr_attributes, '' ); + array_push( $has_opened_tr, false ); + } elseif ( count( $td_history ) == 0 ) { + # Don't do any of the following + $out .= $outLine . "\n"; + continue; + } elseif ( $first_two === '|}' ) { + # We are ending a table + $line = '</table>' . 
substr( $line, 2 ); + $last_tag = array_pop( $last_tag_history ); + + if ( !array_pop( $has_opened_tr ) ) { + $line = "<tr><td></td></tr>{$line}"; + } + + if ( array_pop( $tr_history ) ) { + $line = "</tr>{$line}"; + } + + if ( array_pop( $td_history ) ) { + $line = "</{$last_tag}>{$line}"; + } + array_pop( $tr_attributes ); + if ( $indent_level > 0 ) { + $outLine = rtrim( $line ) . str_repeat( '</dd></dl>', $indent_level ); + } else { + $outLine = $line; + } + } elseif ( $first_two === '|-' ) { + # Now we have a table row + $line = preg_replace( '#^\|-+#', '', $line ); + + # Whats after the tag is now only attributes + $attributes = $this->mStripState->unstripBoth( $line ); + $attributes = Sanitizer::fixTagAttributes( $attributes, 'tr' ); + array_pop( $tr_attributes ); + array_push( $tr_attributes, $attributes ); + + $line = ''; + $last_tag = array_pop( $last_tag_history ); + array_pop( $has_opened_tr ); + array_push( $has_opened_tr, true ); + + if ( array_pop( $tr_history ) ) { + $line = '</tr>'; + } + + if ( array_pop( $td_history ) ) { + $line = "</{$last_tag}>{$line}"; + } + + $outLine = $line; + array_push( $tr_history, false ); + array_push( $td_history, false ); + array_push( $last_tag_history, '' ); + } elseif ( $first_character === '|' + || $first_character === '!' + || $first_two === '|+' + ) { + # This might be cell elements, td, th or captions + if ( $first_two === '|+' ) { + $first_character = '+'; + $line = substr( $line, 2 ); + } else { + $line = substr( $line, 1 ); + } + + // Implies both are valid for table headings. + if ( $first_character === '!' ) { + $line = StringUtils::replaceMarkup( '!!', '||', $line ); + } + + # Split up multiple cells on the same line. + # FIXME : This can result in improper nesting of tags processed + # by earlier parser steps. 
+ $cells = explode( '||', $line ); + + $outLine = ''; + + # Loop through each table cell + foreach ( $cells as $cell ) { + $previous = ''; + if ( $first_character !== '+' ) { + $tr_after = array_pop( $tr_attributes ); + if ( !array_pop( $tr_history ) ) { + $previous = "<tr{$tr_after}>\n"; + } + array_push( $tr_history, true ); + array_push( $tr_attributes, '' ); + array_pop( $has_opened_tr ); + array_push( $has_opened_tr, true ); + } + + $last_tag = array_pop( $last_tag_history ); + + if ( array_pop( $td_history ) ) { + $previous = "</{$last_tag}>\n{$previous}"; + } + + if ( $first_character === '|' ) { + $last_tag = 'td'; + } elseif ( $first_character === '!' ) { + $last_tag = 'th'; + } elseif ( $first_character === '+' ) { + $last_tag = 'caption'; + } else { + $last_tag = ''; + } + + array_push( $last_tag_history, $last_tag ); + + # A cell could contain both parameters and data + $cell_data = explode( '|', $cell, 2 ); + + # T2553: Note that a '|' inside an invalid link should not + # be mistaken as delimiting cell parameters + # Bug T153140: Neither should language converter markup. + if ( preg_match( '/\[\[|-\{/', $cell_data[0] ) === 1 ) { + $cell = "{$previous}<{$last_tag}>" . trim( $cell ); + } elseif ( count( $cell_data ) == 1 ) { + // Whitespace in cells is trimmed + $cell = "{$previous}<{$last_tag}>" . trim( $cell_data[0] ); + } else { + $attributes = $this->mStripState->unstripBoth( $cell_data[0] ); + $attributes = Sanitizer::fixTagAttributes( $attributes, $last_tag ); + // Whitespace in cells is trimmed + $cell = "{$previous}<{$last_tag}{$attributes}>" . trim( $cell_data[1] ); + } + + $outLine .= $cell; + array_push( $td_history, true ); + } + } + $out .= $outLine . 
"\n"; + } + + # Closing open td, tr && table + while ( count( $td_history ) > 0 ) { + if ( array_pop( $td_history ) ) { + $out .= "</td>\n"; + } + if ( array_pop( $tr_history ) ) { + $out .= "</tr>\n"; + } + if ( !array_pop( $has_opened_tr ) ) { + $out .= "<tr><td></td></tr>\n"; + } + + $out .= "</table>\n"; + } + + # Remove trailing line-ending (b/c) + if ( substr( $out, -1 ) === "\n" ) { + $out = substr( $out, 0, -1 ); + } + + # special case: don't return empty table + if ( $out === "<table>\n<tr><td></td></tr>\n</table>" ) { + $out = ''; + } + + return $out; + } + + /** + * Helper function for parse() that transforms wiki markup into half-parsed + * HTML. Only called for $mOutputType == self::OT_HTML. + * + * @private + * + * @param string $text The text to parse + * @param bool $isMain Whether this is being called from the main parse() function + * @param PPFrame|bool $frame A pre-processor frame + * + * @return string + */ + public function internalParse( $text, $isMain = true, $frame = false ) { + $origText = $text; + + // Avoid PHP 7.1 warning from passing $this by reference + $parser = $this; + + # Hook to suspend the parser in this state + if ( !Hooks::run( 'ParserBeforeInternalParse', [ &$parser, &$text, &$this->mStripState ] ) ) { + return $text; + } + + # if $frame is provided, then use $frame for replacing any variables + if ( $frame ) { + # use frame depth to infer how include/noinclude tags should be handled + # depth=0 means this is the top-level document; otherwise it's an included document + if ( !$frame->depth ) { + $flag = 0; + } else { + $flag = self::PTD_FOR_INCLUSION; + } + $dom = $this->preprocessToDom( $text, $flag ); + $text = $frame->expand( $dom ); + } else { + # if $frame is not provided, then use old-style replaceVariables + $text = $this->replaceVariables( $text ); + } + + Hooks::run( 'InternalParseBeforeSanitize', [ &$parser, &$text, &$this->mStripState ] ); + $text = Sanitizer::removeHTMLtags( + $text, + [ $this, 
'attributeStripCallback' ], + false, + array_keys( $this->mTransparentTagHooks ), + [], + [ $this, 'addTrackingCategory' ] + ); + Hooks::run( 'InternalParseBeforeLinks', [ &$parser, &$text, &$this->mStripState ] ); + + # Tables need to come after variable replacement for things to work + # properly; putting them before other transformations should keep + # exciting things like link expansions from showing up in surprising + # places. + $text = $this->doTableStuff( $text ); + + $text = preg_replace( '/(^|\n)-----*/', '\\1<hr />', $text ); + + $text = $this->doDoubleUnderscore( $text ); + + $text = $this->doHeadings( $text ); + $text = $this->replaceInternalLinks( $text ); + $text = $this->doAllQuotes( $text ); + $text = $this->replaceExternalLinks( $text ); + + # replaceInternalLinks may sometimes leave behind + # absolute URLs, which have to be masked to hide them from replaceExternalLinks + $text = str_replace( self::MARKER_PREFIX . 'NOPARSE', '', $text ); + + $text = $this->doMagicLinks( $text ); + $text = $this->formatHeadings( $text, $origText, $isMain ); + + return $text; + } + + /** + * Helper function for parse() that transforms half-parsed HTML into fully + * parsed HTML. + * + * @param string $text + * @param bool $isMain + * @param bool $linestart + * @return string + */ + private function internalParseHalfParsed( $text, $isMain = true, $linestart = true ) { + $text = $this->mStripState->unstripGeneral( $text ); + + // Avoid PHP 7.1 warning from passing $this by reference + $parser = $this; + + if ( $isMain ) { + Hooks::run( 'ParserAfterUnstrip', [ &$parser, &$text ] ); + } + + # Clean up special characters, only run once, next-to-last before doBlockLevels + $fixtags = [ + # French spaces, last one Guillemet-left + # only if there is something before the space + '/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1 ', + # french spaces, Guillemet-right + '/(\\302\\253) /' => '\\1 ', + '/ (!\s*important)/' => ' \\1', # Beware of CSS magic word !important, T13874. 
+ ]; + $text = preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text ); + + $text = $this->doBlockLevels( $text, $linestart ); + + $this->replaceLinkHolders( $text ); + + /** + * The input doesn't get language converted if + * a) It's disabled + * b) Content isn't converted + * c) It's a conversion table + * d) it is an interface message (which is in the user language) + */ + if ( !( $this->mOptions->getDisableContentConversion() + || isset( $this->mDoubleUnderscores['nocontentconvert'] ) ) + ) { + if ( !$this->mOptions->getInterfaceMessage() ) { + # The position of the convert() call should not be changed. it + # assumes that the links are all replaced and the only thing left + # is the <nowiki> mark. + $text = $this->getConverterLanguage()->convert( $text ); + } + } + + $text = $this->mStripState->unstripNoWiki( $text ); + + if ( $isMain ) { + Hooks::run( 'ParserBeforeTidy', [ &$parser, &$text ] ); + } + + $text = $this->replaceTransparentTags( $text ); + $text = $this->mStripState->unstripGeneral( $text ); + + $text = Sanitizer::normalizeCharReferences( $text ); + + if ( MWTidy::isEnabled() ) { + if ( $this->mOptions->getTidy() ) { + $text = MWTidy::tidy( $text ); + } + } else { + # attempt to sanitize at least some nesting problems + # (T4702 and quite a few others) + $tidyregs = [ + # ''Something [http://www.cool.com cool''] --> + # <i>Something</i><a href="http://www.cool.com"..><i>cool></i></a> + '/(<([bi])>)(<([bi])>)?([^<]*)(<\/?a[^<]*>)([^<]*)(<\/\\4>)?(<\/\\2>)/' => + '\\1\\3\\5\\8\\9\\6\\1\\3\\7\\8\\9', + # fix up an anchor inside another anchor, only + # at least for a single single nested link (T5695) + '/(<a[^>]+>)([^<]*)(<a[^>]+>[^<]*)<\/a>(.*)<\/a>/' => + '\\1\\2</a>\\3</a>\\1\\4</a>', + # fix div inside inline elements- doBlockLevels won't wrap a line which + # contains a div, so fix it up here; replace + # div with escaped text + '/(<([aib]) [^>]+>)([^<]*)(<div([^>]*)>)(.*)(<\/div>)([^<]*)(<\/\\2>)/' => + 
'\\1\\3<div\\5>\\6</div>\\8\\9', + # remove empty italic or bold tag pairs, some + # introduced by rules above + '/<([bi])><\/\\1>/' => '', + ]; + + $text = preg_replace( + array_keys( $tidyregs ), + array_values( $tidyregs ), + $text ); + } + + if ( $isMain ) { + Hooks::run( 'ParserAfterTidy', [ &$parser, &$text ] ); + } + + return $text; + } + + /** + * Replace special strings like "ISBN xxx" and "RFC xxx" with + * magic external links. + * + * DML + * @private + * + * @param string $text + * + * @return string + */ + public function doMagicLinks( $text ) { + $prots = wfUrlProtocolsWithoutProtRel(); + $urlChar = self::EXT_LINK_URL_CLASS; + $addr = self::EXT_LINK_ADDR; + $space = self::SPACE_NOT_NL; # non-newline space + $spdash = "(?:-|$space)"; # a dash or a non-newline space + $spaces = "$space++"; # possessive match of 1 or more spaces + $text = preg_replace_callback( + '!(?: # Start cases + (<a[ \t\r\n>].*?</a>) | # m[1]: Skip link text + (<.*?>) | # m[2]: Skip stuff inside HTML elements' . " + (\b # m[3]: Free external links + (?i:$prots) + ($addr$urlChar*) # m[4]: Post-protocol path + ) | + \b(?:RFC|PMID) $spaces # m[5]: RFC or PMID, capture number + ([0-9]+)\b | + \bISBN $spaces ( # m[6]: ISBN, capture number + (?: 97[89] $spdash? )? # optional 13-digit ISBN prefix + (?: [0-9] $spdash? ){9} # 9 digits with opt. 
delimiters + [0-9Xx] # check digit + )\b + )!xu", [ $this, 'magicLinkCallback' ], $text ); + return $text; + } + + /** + * @throws MWException + * @param array $m + * @return string HTML + */ + public function magicLinkCallback( $m ) { + if ( isset( $m[1] ) && $m[1] !== '' ) { + # Skip anchor + return $m[0]; + } elseif ( isset( $m[2] ) && $m[2] !== '' ) { + # Skip HTML element + return $m[0]; + } elseif ( isset( $m[3] ) && $m[3] !== '' ) { + # Free external link + return $this->makeFreeExternalLink( $m[0], strlen( $m[4] ) ); + } elseif ( isset( $m[5] ) && $m[5] !== '' ) { + # RFC or PMID + if ( substr( $m[0], 0, 3 ) === 'RFC' ) { + if ( !$this->mOptions->getMagicRFCLinks() ) { + return $m[0]; + } + $keyword = 'RFC'; + $urlmsg = 'rfcurl'; + $cssClass = 'mw-magiclink-rfc'; + $trackingCat = 'magiclink-tracking-rfc'; + $id = $m[5]; + } elseif ( substr( $m[0], 0, 4 ) === 'PMID' ) { + if ( !$this->mOptions->getMagicPMIDLinks() ) { + return $m[0]; + } + $keyword = 'PMID'; + $urlmsg = 'pubmedurl'; + $cssClass = 'mw-magiclink-pmid'; + $trackingCat = 'magiclink-tracking-pmid'; + $id = $m[5]; + } else { + throw new MWException( __METHOD__ . ': unrecognised match type "' . + substr( $m[0], 0, 20 ) . 
'"' ); + } + $url = wfMessage( $urlmsg, $id )->inContentLanguage()->text(); + $this->addTrackingCategory( $trackingCat ); + return Linker::makeExternalLink( $url, "{$keyword} {$id}", true, $cssClass, [], $this->mTitle ); + } elseif ( isset( $m[6] ) && $m[6] !== '' + && $this->mOptions->getMagicISBNLinks() + ) { + # ISBN + $isbn = $m[6]; + $space = self::SPACE_NOT_NL; # non-newline space + $isbn = preg_replace( "/$space/", ' ', $isbn ); + $num = strtr( $isbn, [ + '-' => '', + ' ' => '', + 'x' => 'X', + ] ); + $this->addTrackingCategory( 'magiclink-tracking-isbn' ); + return $this->getLinkRenderer()->makeKnownLink( + SpecialPage::getTitleFor( 'Booksources', $num ), + "ISBN $isbn", + [ + 'class' => 'internal mw-magiclink-isbn', + 'title' => false // suppress title attribute + ] + ); + } else { + return $m[0]; + } + } + + /** + * Make a free external link, given a user-supplied URL + * + * @param string $url + * @param int $numPostProto + * The number of characters after the protocol. + * @return string HTML + * @private + */ + public function makeFreeExternalLink( $url, $numPostProto ) { + $trail = ''; + + # The characters '<' and '>' (which were escaped by + # removeHTMLtags()) should not be included in + # URLs, per RFC 2396. + # Make terminate a URL as well (bug T84937) + $m2 = []; + if ( preg_match( + '/&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));/', + $url, + $m2, + PREG_OFFSET_CAPTURE + ) ) { + $trail = substr( $url, $m2[0][1] ) . 
$trail; + $url = substr( $url, 0, $m2[0][1] ); + } + + # Move trailing punctuation to $trail + $sep = ',;\.:!?'; + # If there is no left bracket, then consider right brackets fair game too + if ( strpos( $url, '(' ) === false ) { + $sep .= ')'; + } + + $urlRev = strrev( $url ); + $numSepChars = strspn( $urlRev, $sep ); + # Don't break a trailing HTML entity by moving the ; into $trail + # This is in hot code, so use substr_compare to avoid having to + # create a new string object for the comparison + if ( $numSepChars && substr_compare( $url, ";", -$numSepChars, 1 ) === 0 ) { + # more optimization: instead of running preg_match with a $ + # anchor, which can be slow, do the match on the reversed + # string starting at the desired offset. + # un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i + if ( preg_match( '/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, $numSepChars ) ) { + $numSepChars--; + } + } + if ( $numSepChars ) { + $trail = substr( $url, -$numSepChars ) . $trail; + $url = substr( $url, 0, -$numSepChars ); + } + + # Verify that we still have a real URL after trail removal, and + # not just lone protocol + if ( strlen( $trail ) >= $numPostProto ) { + return $url . $trail; + } + + $url = Sanitizer::cleanUrl( $url ); + + # Is this an external image? + $text = $this->maybeMakeExternalImage( $url ); + if ( $text === false ) { + # Not an image, make a link + $text = Linker::makeExternalLink( $url, + $this->getConverterLanguage()->markNoConversion( $url, true ), + true, 'free', + $this->getExternalLinkAttribs( $url ), $this->mTitle ); + # Register it in the output object... + $this->mOutput->addExternalLink( $url ); + } + return $text . 
$trail; + } + + /** + * Parse headers and return html + * + * @private + * + * @param string $text + * + * @return string + */ + public function doHeadings( $text ) { + for ( $i = 6; $i >= 1; --$i ) { + $h = str_repeat( '=', $i ); + // Trim non-newline whitespace from headings + // Using \s* will break for: "==\n===\n" and parse as <h2>=</h2> + $text = preg_replace( "/^(?:$h)[ \\t]*(.+?)[ \\t]*(?:$h)\\s*$/m", "<h$i>\\1</h$i>", $text ); + } + return $text; + } + + /** + * Replace single quotes with HTML markup + * @private + * + * @param string $text + * + * @return string The altered text + */ + public function doAllQuotes( $text ) { + $outtext = ''; + $lines = StringUtils::explode( "\n", $text ); + foreach ( $lines as $line ) { + $outtext .= $this->doQuotes( $line ) . "\n"; + } + $outtext = substr( $outtext, 0, -1 ); + return $outtext; + } + + /** + * Helper function for doAllQuotes() + * + * @param string $text + * + * @return string + */ + public function doQuotes( $text ) { + $arr = preg_split( "/(''+)/", $text, -1, PREG_SPLIT_DELIM_CAPTURE ); + $countarr = count( $arr ); + if ( $countarr == 1 ) { + return $text; + } + + // First, do some preliminary work. This may shift some apostrophes from + // being mark-up to being text. It also counts the number of occurrences + // of bold and italics mark-ups. + $numbold = 0; + $numitalics = 0; + for ( $i = 1; $i < $countarr; $i += 2 ) { + $thislen = strlen( $arr[$i] ); + // If there are ever four apostrophes, assume the first is supposed to + // be text, and the remaining three constitute mark-up for bold text. + // (T15227: ''''foo'''' turns into ' ''' foo ' ''') + if ( $thislen == 4 ) { + $arr[$i - 1] .= "'"; + $arr[$i] = "'''"; + $thislen = 3; + } elseif ( $thislen > 5 ) { + // If there are more than 5 apostrophes in a row, assume they're all + // text except for the last 5. 
+ // (T15227: ''''''foo'''''' turns into ' ''''' foo ' ''''') + $arr[$i - 1] .= str_repeat( "'", $thislen - 5 ); + $arr[$i] = "'''''"; + $thislen = 5; + } + // Count the number of occurrences of bold and italics mark-ups. + if ( $thislen == 2 ) { + $numitalics++; + } elseif ( $thislen == 3 ) { + $numbold++; + } elseif ( $thislen == 5 ) { + $numitalics++; + $numbold++; + } + } + + // If there is an odd number of both bold and italics, it is likely + // that one of the bold ones was meant to be an apostrophe followed + // by italics. Which one we cannot know for certain, but it is more + // likely to be one that has a single-letter word before it. + if ( ( $numbold % 2 == 1 ) && ( $numitalics % 2 == 1 ) ) { + $firstsingleletterword = -1; + $firstmultiletterword = -1; + $firstspace = -1; + for ( $i = 1; $i < $countarr; $i += 2 ) { + if ( strlen( $arr[$i] ) == 3 ) { + $x1 = substr( $arr[$i - 1], -1 ); + $x2 = substr( $arr[$i - 1], -2, 1 ); + if ( $x1 === ' ' ) { + if ( $firstspace == -1 ) { + $firstspace = $i; + } + } elseif ( $x2 === ' ' ) { + $firstsingleletterword = $i; + // if $firstsingleletterword is set, we don't + // look at the other options, so we can bail early. + break; + } else { + if ( $firstmultiletterword == -1 ) { + $firstmultiletterword = $i; + } + } + } + } + + // If there is a single-letter word, use it! + if ( $firstsingleletterword > -1 ) { + $arr[$firstsingleletterword] = "''"; + $arr[$firstsingleletterword - 1] .= "'"; + } elseif ( $firstmultiletterword > -1 ) { + // If not, but there's a multi-letter word, use that one. + $arr[$firstmultiletterword] = "''"; + $arr[$firstmultiletterword - 1] .= "'"; + } elseif ( $firstspace > -1 ) { + // ... otherwise use the first one that has neither. + // (notice that it is possible for all three to be -1 if, for example, + // there is only one pentuple-apostrophe in the line) + $arr[$firstspace] = "''"; + $arr[$firstspace - 1] .= "'"; + } + } + + // Now let's actually convert our apostrophic mush to HTML! 
+ $output = ''; + $buffer = ''; + $state = ''; + $i = 0; + foreach ( $arr as $r ) { + if ( ( $i % 2 ) == 0 ) { + if ( $state === 'both' ) { + $buffer .= $r; + } else { + $output .= $r; + } + } else { + $thislen = strlen( $r ); + if ( $thislen == 2 ) { + if ( $state === 'i' ) { + $output .= '</i>'; + $state = ''; + } elseif ( $state === 'bi' ) { + $output .= '</i>'; + $state = 'b'; + } elseif ( $state === 'ib' ) { + $output .= '</b></i><b>'; + $state = 'b'; + } elseif ( $state === 'both' ) { + $output .= '<b><i>' . $buffer . '</i>'; + $state = 'b'; + } else { // $state can be 'b' or '' + $output .= '<i>'; + $state .= 'i'; + } + } elseif ( $thislen == 3 ) { + if ( $state === 'b' ) { + $output .= '</b>'; + $state = ''; + } elseif ( $state === 'bi' ) { + $output .= '</i></b><i>'; + $state = 'i'; + } elseif ( $state === 'ib' ) { + $output .= '</b>'; + $state = 'i'; + } elseif ( $state === 'both' ) { + $output .= '<i><b>' . $buffer . '</b>'; + $state = 'i'; + } else { // $state can be 'i' or '' + $output .= '<b>'; + $state .= 'b'; + } + } elseif ( $thislen == 5 ) { + if ( $state === 'b' ) { + $output .= '</b><i>'; + $state = 'i'; + } elseif ( $state === 'i' ) { + $output .= '</i><b>'; + $state = 'b'; + } elseif ( $state === 'bi' ) { + $output .= '</i></b>'; + $state = ''; + } elseif ( $state === 'ib' ) { + $output .= '</b></i>'; + $state = ''; + } elseif ( $state === 'both' ) { + $output .= '<i><b>' . $buffer . '</b></i>'; + $state = ''; + } else { // ($state == '') + $buffer = ''; + $state = 'both'; + } + } + } + $i++; + } + // Now close all remaining tags. Notice that the order is important. + if ( $state === 'b' || $state === 'ib' ) { + $output .= '</b>'; + } + if ( $state === 'i' || $state === 'bi' || $state === 'ib' ) { + $output .= '</i>'; + } + if ( $state === 'bi' ) { + $output .= '</b>'; + } + // There might be lonely ''''', so make sure we have a buffer + if ( $state === 'both' && $buffer ) { + $output .= '<b><i>' . $buffer . 
'</i></b>'; + } + return $output; + } + + /** + * Replace external links (REL) + * + * Note: this is all very hackish and the order of execution matters a lot. + * Make sure to run tests/parser/parserTests.php if you change this code. + * + * @private + * + * @param string $text + * + * @throws MWException + * @return string + */ + public function replaceExternalLinks( $text ) { + $bits = preg_split( $this->mExtLinkBracketedRegex, $text, -1, PREG_SPLIT_DELIM_CAPTURE ); + if ( $bits === false ) { + throw new MWException( "PCRE needs to be compiled with " + . "--enable-unicode-properties in order for MediaWiki to function" ); + } + $s = array_shift( $bits ); + + $i = 0; + while ( $i < count( $bits ) ) { + $url = $bits[$i++]; + $i++; // protocol + $text = $bits[$i++]; + $trail = $bits[$i++]; + + # The characters '<' and '>' (which were escaped by + # removeHTMLtags()) should not be included in + # URLs, per RFC 2396. + $m2 = []; + if ( preg_match( '/&(lt|gt);/', $url, $m2, PREG_OFFSET_CAPTURE ) ) { + $text = substr( $url, $m2[0][1] ) . ' ' . $text; + $url = substr( $url, 0, $m2[0][1] ); + } + + # If the link text is an image URL, replace it with an <img> tag + # This happened by accident in the original parser, but some people used it extensively + $img = $this->maybeMakeExternalImage( $text ); + if ( $img !== false ) { + $text = $img; + } + + $dtrail = ''; + + # Set linktype for CSS + $linktype = 'text'; + + # No link text, e.g. [http://domain.tld/some.link] + if ( $text == '' ) { + # Autonumber + $langObj = $this->getTargetLanguage(); + $text = '[' . $langObj->formatNum( ++$this->mAutonumber ) . ']'; + $linktype = 'autonumber'; + } else { + # Have link text, e.g. 
[http://domain.tld/some.link text]s + # Check for trail + list( $dtrail, $trail ) = Linker::splitTrail( $trail ); + } + + $text = $this->getConverterLanguage()->markNoConversion( $text ); + + $url = Sanitizer::cleanUrl( $url ); + + # Use the encoded URL + # This means that users can paste URLs directly into the text + # Funny characters like ö aren't valid in URLs anyway + # This was changed in August 2004 + $s .= Linker::makeExternalLink( $url, $text, false, $linktype, + $this->getExternalLinkAttribs( $url ), $this->mTitle ) . $dtrail . $trail; + + # Register link in the output object. + $this->mOutput->addExternalLink( $url ); + } + + return $s; + } + + /** + * Get the rel attribute for a particular external link. + * + * @since 1.21 + * @param string|bool $url Optional URL, to extract the domain from for rel => + * nofollow if appropriate + * @param Title $title Optional Title, for wgNoFollowNsExceptions lookups + * @return string|null Rel attribute for $url + */ + public static function getExternalLinkRel( $url = false, $title = null ) { + global $wgNoFollowLinks, $wgNoFollowNsExceptions, $wgNoFollowDomainExceptions; + $ns = $title ? $title->getNamespace() : false; + if ( $wgNoFollowLinks && !in_array( $ns, $wgNoFollowNsExceptions ) + && !wfMatchesDomainList( $url, $wgNoFollowDomainExceptions ) + ) { + return 'nofollow'; + } + return null; + } + + /** + * Get an associative array of additional HTML attributes appropriate for a + * particular external link. This currently may include rel => nofollow + * (depending on configuration, namespace, and the URL's domain) and/or a + * target attribute (depending on configuration). 
+ * + * @param string $url URL to extract the domain from for rel => + * nofollow if appropriate + * @return array Associative array of HTML attributes + */ + public function getExternalLinkAttribs( $url ) { + $attribs = []; + $rel = self::getExternalLinkRel( $url, $this->mTitle ); + + $target = $this->mOptions->getExternalLinkTarget(); + if ( $target ) { + $attribs['target'] = $target; + if ( !in_array( $target, [ '_self', '_parent', '_top' ] ) ) { + // T133507. New windows can navigate parent cross-origin. + // Including noreferrer due to lacking browser + // support of noopener. Eventually noreferrer should be removed. + if ( $rel !== '' ) { + $rel .= ' '; + } + $rel .= 'noreferrer noopener'; + } + } + $attribs['rel'] = $rel; + return $attribs; + } + + /** + * Replace unusual escape codes in a URL with their equivalent characters + * + * This generally follows the syntax defined in RFC 3986, with special + * consideration for HTTP query strings. + * + * @param string $url + * @return string + */ + public static function normalizeLinkUrl( $url ) { + # First, make sure unsafe characters are encoded + $url = preg_replace_callback( '/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/', + function ( $m ) { + return rawurlencode( $m[0] ); + }, + $url + ); + + $ret = ''; + $end = strlen( $url ); + + # Fragment part - 'fragment' + $start = strpos( $url, '#' ); + if ( $start !== false && $start < $end ) { + $ret = self::normalizeUrlComponent( + substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}' ) . $ret; + $end = $start; + } + + # Query part - 'query' minus &=+; + $start = strpos( $url, '?' ); + if ( $start !== false && $start < $end ) { + $ret = self::normalizeUrlComponent( + substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}&=+;' ) . $ret; + $end = $start; + } + + # Scheme and path part - 'pchar' + # (we assume no userinfo or encoded colons in the host) + $ret = self::normalizeUrlComponent( + substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . 
$ret; + + return $ret; + } + + private static function normalizeUrlComponent( $component, $unsafe ) { + $callback = function ( $matches ) use ( $unsafe ) { + $char = urldecode( $matches[0] ); + $ord = ord( $char ); + if ( $ord > 32 && $ord < 127 && strpos( $unsafe, $char ) === false ) { + # Unescape it + return $char; + } else { + # Leave it escaped, but use uppercase for a-f + return strtoupper( $matches[0] ); + } + }; + return preg_replace_callback( '/%[0-9A-Fa-f]{2}/', $callback, $component ); + } + + /** + * make an image if it's allowed, either through the global + * option, through the exception, or through the on-wiki whitelist + * + * @param string $url + * + * @return string + */ + private function maybeMakeExternalImage( $url ) { + $imagesfrom = $this->mOptions->getAllowExternalImagesFrom(); + $imagesexception = !empty( $imagesfrom ); + $text = false; + # $imagesfrom could be either a single string or an array of strings, parse out the latter + if ( $imagesexception && is_array( $imagesfrom ) ) { + $imagematch = false; + foreach ( $imagesfrom as $match ) { + if ( strpos( $url, $match ) === 0 ) { + $imagematch = true; + break; + } + } + } elseif ( $imagesexception ) { + $imagematch = ( strpos( $url, $imagesfrom ) === 0 ); + } else { + $imagematch = false; + } + + if ( $this->mOptions->getAllowExternalImages() + || ( $imagesexception && $imagematch ) + ) { + if ( preg_match( self::EXT_IMAGE_REGEX, $url ) ) { + # Image found + $text = Linker::makeExternalImage( $url ); + } + } + if ( !$text && $this->mOptions->getEnableImageWhitelist() + && preg_match( self::EXT_IMAGE_REGEX, $url ) + ) { + $whitelist = explode( + "\n", + wfMessage( 'external_image_whitelist' )->inContentLanguage()->text() + ); + + foreach ( $whitelist as $entry ) { + # Sanitize the regex fragment, make it case-insensitive, ignore blank entries/comments + if ( strpos( $entry, '#' ) === 0 || $entry === '' ) { + continue; + } + if ( preg_match( '/' . str_replace( '/', '\\/', $entry ) . 
'/i', $url ) ) { + # Image matches a whitelist entry + $text = Linker::makeExternalImage( $url ); + break; + } + } + } + return $text; + } + + /** + * Process [[ ]] wikilinks + * + * @param string $s + * + * @return string Processed text + * + * @private + */ + public function replaceInternalLinks( $s ) { + $this->mLinkHolders->merge( $this->replaceInternalLinks2( $s ) ); + return $s; + } + + /** + * Process [[ ]] wikilinks (RIL) + * @param string &$s + * @throws MWException + * @return LinkHolderArray + * + * @private + */ + public function replaceInternalLinks2( &$s ) { + global $wgExtraInterlanguageLinkPrefixes; + + static $tc = false, $e1, $e1_img; + # the % is needed to support urlencoded titles as well + if ( !$tc ) { + $tc = Title::legalChars() . '#%'; + # Match a link having the form [[namespace:link|alternate]]trail + $e1 = "/^([{$tc}]+)(?:\\|(.+?))?]](.*)\$/sD"; + # Match cases where there is no "]]", which might still be images + $e1_img = "/^([{$tc}]+)\\|(.*)\$/sD"; + } + + $holders = new LinkHolderArray( $this ); + + # split the entire text string on occurrences of [[ + $a = StringUtils::explode( '[[', ' ' . $s ); + # get the first element (all text up to first [[), and remove the space we added + $s = $a->current(); + $a->next(); + $line = $a->current(); # Workaround for broken ArrayIterator::next() that returns "void" + $s = substr( $s, 1 ); + + $useLinkPrefixExtension = $this->getTargetLanguage()->linkPrefixExtension(); + $e2 = null; + if ( $useLinkPrefixExtension ) { + # Match the end of a line for a word that's not followed by whitespace, + # e.g. in the case of 'The Arab al[[Razi]]', 'al' will be matched + global $wgContLang; + $charset = $wgContLang->linkPrefixCharset(); + $e2 = "/^((?>.*[^$charset]|))(.+)$/sDu"; + } + + if ( is_null( $this->mTitle ) ) { + throw new MWException( __METHOD__ . 
": \$this->mTitle is null\n" ); + } + $nottalk = !$this->mTitle->isTalkPage(); + + if ( $useLinkPrefixExtension ) { + $m = []; + if ( preg_match( $e2, $s, $m ) ) { + $first_prefix = $m[2]; + } else { + $first_prefix = false; + } + } else { + $prefix = ''; + } + + $useSubpages = $this->areSubpagesAllowed(); + + # Loop for each link + for ( ; $line !== false && $line !== null; $a->next(), $line = $a->current() ) { + # Check for excessive memory usage + if ( $holders->isBig() ) { + # Too big + # Do the existence check, replace the link holders and clear the array + $holders->replace( $s ); + $holders->clear(); + } + + if ( $useLinkPrefixExtension ) { + if ( preg_match( $e2, $s, $m ) ) { + $prefix = $m[2]; + $s = $m[1]; + } else { + $prefix = ''; + } + # first link + if ( $first_prefix ) { + $prefix = $first_prefix; + $first_prefix = false; + } + } + + $might_be_img = false; + + if ( preg_match( $e1, $line, $m ) ) { # page with normal text or alt + $text = $m[2]; + # If we get a ] at the beginning of $m[3] that means we have a link that's something like: + # [[Image:Foo.jpg|[http://example.com desc]]] <- having three ] in a row fucks up, + # the real problem is with the $e1 regex + # See T1500. + # Still some problems for cases where the ] is meant to be outside punctuation, + # and no image is in sight. See T4095. + if ( $text !== '' + && substr( $m[3], 0, 1 ) === ']' + && strpos( $text, '[' ) !== false + ) { + $text .= ']'; # so that replaceExternalLinks($text) works later + $m[3] = substr( $m[3], 1 ); + } + # fix up urlencoded title texts + if ( strpos( $m[1], '%' ) !== false ) { + # Should anchors '#' also be rejected? 
+ $m[1] = str_replace( [ '<', '>' ], [ '&lt;', '&gt;' ], rawurldecode( $m[1] ) ); + } + $trail = $m[3]; + } elseif ( preg_match( $e1_img, $line, $m ) ) { + # Invalid, but might be an image with a link in its caption + $might_be_img = true; + $text = $m[2]; + if ( strpos( $m[1], '%' ) !== false ) { + $m[1] = str_replace( [ '<', '>' ], [ '&lt;', '&gt;' ], rawurldecode( $m[1] ) ); + } + $trail = ""; + } else { # Invalid form; output directly + $s .= $prefix . '[[' . $line; + continue; + } + + $origLink = ltrim( $m[1], ' ' ); + + # Don't allow internal links to pages containing + # PROTO: where PROTO is a valid URL protocol; these + # should be external links. + if ( preg_match( '/^(?i:' . $this->mUrlProtocols . ')/', $origLink ) ) { + $s .= $prefix . '[[' . $line; + continue; + } + + # Make subpage if necessary + if ( $useSubpages ) { + $link = $this->maybeDoSubpageLink( $origLink, $text ); + } else { + $link = $origLink; + } + + // \x7f isn't a default legal title char, so most likely strip + // markers will force us into the "invalid form" path above. But, + // just in case, let's assert that xmlish tags aren't valid in + // the title position. + $unstrip = $this->mStripState->killMarkers( $link ); + $noMarkers = ( $unstrip === $link ); + + $nt = $noMarkers ? Title::newFromText( $link ) : null; + if ( $nt === null ) { + $s .= $prefix . '[[' .
$line; + continue; + } + + $ns = $nt->getNamespace(); + $iw = $nt->getInterwiki(); + + $noforce = ( substr( $origLink, 0, 1 ) !== ':' ); + + if ( $might_be_img ) { # if this is actually an invalid link + if ( $ns == NS_FILE && $noforce ) { # but might be an image + $found = false; + while ( true ) { + # look at the next 'line' to see if we can close it there + $a->next(); + $next_line = $a->current(); + if ( $next_line === false || $next_line === null ) { + break; + } + $m = explode( ']]', $next_line, 3 ); + if ( count( $m ) == 3 ) { + # the first ]] closes the inner link, the second the image + $found = true; + $text .= "[[{$m[0]}]]{$m[1]}"; + $trail = $m[2]; + break; + } elseif ( count( $m ) == 2 ) { + # if there's exactly one ]] that's fine, we'll keep looking + $text .= "[[{$m[0]}]]{$m[1]}"; + } else { + # if $next_line is invalid too, we need look no further + $text .= '[[' . $next_line; + break; + } + } + if ( !$found ) { + # we couldn't find the end of this imageLink, so output it raw + # but don't ignore what might be perfectly normal links in the text we've examined + $holders->merge( $this->replaceInternalLinks2( $text ) ); + $s .= "{$prefix}[[$link|$text"; + # note: no $trail, because without an end, there *is* no trail + continue; + } + } else { # it's not an image, so output it raw + $s .= "{$prefix}[[$link|$text"; + # note: no $trail, because without an end, there *is* no trail + continue; + } + } + + $wasblank = ( $text == '' ); + if ( $wasblank ) { + $text = $link; + if ( !$noforce ) { + # Strip off leading ':' + $text = substr( $text, 1 ); + } + } else { + # T6598 madness. 
Handle the quotes only if they come from the alternate part + # [[Lista d''e paise d''o munno]] -> <a href="...">Lista d''e paise d''o munno</a> + # [[Criticism of Harry Potter|Criticism of ''Harry Potter'']] + # -> <a href="Criticism of Harry Potter">Criticism of <i>Harry Potter</i></a> + $text = $this->doQuotes( $text ); + } + + # Link not escaped by : , create the various objects + if ( $noforce && !$nt->wasLocalInterwiki() ) { + # Interwikis + if ( + $iw && $this->mOptions->getInterwikiMagic() && $nottalk && ( + Language::fetchLanguageName( $iw, null, 'mw' ) || + in_array( $iw, $wgExtraInterlanguageLinkPrefixes ) + ) + ) { + # T26502: filter duplicates + if ( !isset( $this->mLangLinkLanguages[$iw] ) ) { + $this->mLangLinkLanguages[$iw] = true; + $this->mOutput->addLanguageLink( $nt->getFullText() ); + } + + /** + * Strip the whitespace interwiki links produce, see T10897 + */ + $s = rtrim( $s . $prefix ) . $trail; # T175416 + continue; + } + + if ( $ns == NS_FILE ) { + if ( !wfIsBadImage( $nt->getDBkey(), $this->mTitle ) ) { + if ( $wasblank ) { + # if no parameters were passed, $text + # becomes something like "File:Foo.png", + # which we don't want to pass on to the + # image generator + $text = ''; + } else { + # recursively parse links inside the image caption + # actually, this will parse them in any other parameters, too, + # but it might be hard to fix that, and it doesn't matter ATM + $text = $this->replaceExternalLinks( $text ); + $holders->merge( $this->replaceInternalLinks2( $text ) ); + } + # cloak any absolute URLs inside the image markup, so replaceExternalLinks() won't touch them + $s .= $prefix . $this->armorLinks( + $this->makeImage( $nt, $text, $holders ) ) . $trail; + continue; + } + } elseif ( $ns == NS_CATEGORY ) { + /** + * Strip the whitespace Category links produce, see T2087 + */ + $s = rtrim( $s . $prefix ) . 
$trail; # T2087, T87753 + + if ( $wasblank ) { + $sortkey = $this->getDefaultSort(); + } else { + $sortkey = $text; + } + $sortkey = Sanitizer::decodeCharReferences( $sortkey ); + $sortkey = str_replace( "\n", '', $sortkey ); + $sortkey = $this->getConverterLanguage()->convertCategoryKey( $sortkey ); + $this->mOutput->addCategory( $nt->getDBkey(), $sortkey ); + + continue; + } + } + + # Self-link checking. For some languages, variants of the title are checked in + # LinkHolderArray::doVariants() to allow batching the existence checks necessary + # for linking to a different variant. + if ( $ns != NS_SPECIAL && $nt->equals( $this->mTitle ) && !$nt->hasFragment() ) { + $s .= $prefix . Linker::makeSelfLinkObj( $nt, $text, '', $trail ); + continue; + } + + # NS_MEDIA is a pseudo-namespace for linking directly to a file + # @todo FIXME: Should do batch file existence checks, see comment below + if ( $ns == NS_MEDIA ) { + # Give extensions a chance to select the file revision for us + $options = []; + $descQuery = false; + Hooks::run( 'BeforeParserFetchFileAndTitle', + [ $this, $nt, &$options, &$descQuery ] ); + # Fetch and register the file (file title may be different via hooks) + list( $file, $nt ) = $this->fetchFileAndTitle( $nt, $options ); + # Cloak with NOPARSE to avoid replacement in replaceExternalLinks + $s .= $prefix . $this->armorLinks( + Linker::makeMediaLinkFile( $nt, $file, $text ) ) . 
$trail; + continue; + } + + # Some titles, such as valid special pages or files in foreign repos, should + # be shown as bluelinks even though they're not included in the page table + # @todo FIXME: isAlwaysKnown() can be expensive for file links; we should really do + # batch file existence checks for NS_FILE and NS_MEDIA + if ( $iw == '' && $nt->isAlwaysKnown() ) { + $this->mOutput->addLink( $nt ); + $s .= $this->makeKnownLinkHolder( $nt, $text, $trail, $prefix ); + } else { + # Links will be added to the output link list after checking + $s .= $holders->makeHolder( $nt, $text, [], $trail, $prefix ); + } + } + return $holders; + } + + /** + * Render a forced-blue link inline; protect against double expansion of + * URLs if we're in a mode that prepends full URL prefixes to internal links. + * Since this little disaster has to split off the trail text to avoid + * breaking URLs in the following text without breaking trails on the + * wiki links, it's been made into a horrible function. + * + * @param Title $nt + * @param string $text + * @param string $trail + * @param string $prefix + * @return string HTML-wikitext mix oh yuck + */ + protected function makeKnownLinkHolder( $nt, $text = '', $trail = '', $prefix = '' ) { + list( $inside, $trail ) = Linker::splitTrail( $trail ); + + if ( $text == '' ) { + $text = htmlspecialchars( $nt->getPrefixedText() ); + } + + $link = $this->getLinkRenderer()->makeKnownLink( + $nt, new HtmlArmor( "$prefix$text$inside" ) + ); + + return $this->armorLinks( $link ) . $trail; + } + + /** + * Insert a NOPARSE hacky thing into any inline links in a chunk that's + * going to go through further parsing steps before inline URL expansion. + * + * Not needed quite as much as it used to be since free links are a bit + * more sensible these days. But bracketed links are still an issue. 
+ * + * @param string $text More-or-less HTML + * @return string Less-or-more HTML with NOPARSE bits + */ + public function armorLinks( $text ) { + return preg_replace( '/\b((?i)' . $this->mUrlProtocols . ')/', + self::MARKER_PREFIX . "NOPARSE$1", $text ); + } + + /** + * Return true if subpage links should be expanded on this page. + * @return bool + */ + public function areSubpagesAllowed() { + # Some namespaces don't allow subpages + return MWNamespace::hasSubpages( $this->mTitle->getNamespace() ); + } + + /** + * Handle link to subpage if necessary + * + * @param string $target The source of the link + * @param string &$text The link text, modified as necessary + * @return string The full name of the link + * @private + */ + public function maybeDoSubpageLink( $target, &$text ) { + return Linker::normalizeSubpageLink( $this->mTitle, $target, $text ); + } + + /** + * Make lists from lines starting with ':', '*', '#', etc. (DBL) + * + * @param string $text + * @param bool $linestart Whether or not this is at the start of a line. + * @private + * @return string The lists rendered as HTML + */ + public function doBlockLevels( $text, $linestart ) { + return BlockLevelPass::doBlockLevels( $text, $linestart ); + } + + /** + * Return value of a magic variable (like PAGENAME) + * + * @private + * + * @param string $index Magic variable identifier as mapped in MagicWord::$mVariableIDs + * @param bool|PPFrame $frame + * + * @throws MWException + * @return string + */ + public function getVariableValue( $index, $frame = false ) { + global $wgContLang, $wgSitename, $wgServer, $wgServerName; + global $wgArticlePath, $wgScriptPath, $wgStylePath; + + if ( is_null( $this->mTitle ) ) { + // If no title set, bad things are going to happen + // later. Title should always be set since this + // should only be called in the middle of a parse + // operation (but the unit-tests do funky stuff) + throw new MWException( __METHOD__ . ' Should only be ' + . 
' called while parsing (no title set)' ); + } + + // Avoid PHP 7.1 warning from passing $this by reference + $parser = $this; + + /** + * Some of these require message or data lookups and can be + * expensive to check many times. + */ + if ( Hooks::run( 'ParserGetVariableValueVarCache', [ &$parser, &$this->mVarCache ] ) ) { + if ( isset( $this->mVarCache[$index] ) ) { + return $this->mVarCache[$index]; + } + } + + $ts = wfTimestamp( TS_UNIX, $this->mOptions->getTimestamp() ); + Hooks::run( 'ParserGetVariableValueTs', [ &$parser, &$ts ] ); + + $pageLang = $this->getFunctionLang(); + + switch ( $index ) { + case '!': + $value = '|'; + break; + case 'currentmonth': + $value = $pageLang->formatNum( MWTimestamp::getInstance( $ts )->format( 'm' ), true ); + break; + case 'currentmonth1': + $value = $pageLang->formatNum( MWTimestamp::getInstance( $ts )->format( 'n' ), true ); + break; + case 'currentmonthname': + $value = $pageLang->getMonthName( MWTimestamp::getInstance( $ts )->format( 'n' ) ); + break; + case 'currentmonthnamegen': + $value = $pageLang->getMonthNameGen( MWTimestamp::getInstance( $ts )->format( 'n' ) ); + break; + case 'currentmonthabbrev': + $value = $pageLang->getMonthAbbreviation( MWTimestamp::getInstance( $ts )->format( 'n' ) ); + break; + case 'currentday': + $value = $pageLang->formatNum( MWTimestamp::getInstance( $ts )->format( 'j' ), true ); + break; + case 'currentday2': + $value = $pageLang->formatNum( MWTimestamp::getInstance( $ts )->format( 'd' ), true ); + break; + case 'localmonth': + $value = $pageLang->formatNum( MWTimestamp::getLocalInstance( $ts )->format( 'm' ), true ); + break; + case 'localmonth1': + $value = $pageLang->formatNum( MWTimestamp::getLocalInstance( $ts )->format( 'n' ), true ); + break; + case 'localmonthname': + $value = $pageLang->getMonthName( MWTimestamp::getLocalInstance( $ts )->format( 'n' ) ); + break; + case 'localmonthnamegen': + $value = $pageLang->getMonthNameGen( MWTimestamp::getLocalInstance( $ts )->format( 
'n' ) ); + break; + case 'localmonthabbrev': + $value = $pageLang->getMonthAbbreviation( MWTimestamp::getLocalInstance( $ts )->format( 'n' ) ); + break; + case 'localday': + $value = $pageLang->formatNum( MWTimestamp::getLocalInstance( $ts )->format( 'j' ), true ); + break; + case 'localday2': + $value = $pageLang->formatNum( MWTimestamp::getLocalInstance( $ts )->format( 'd' ), true ); + break; + case 'pagename': + $value = wfEscapeWikiText( $this->mTitle->getText() ); + break; + case 'pagenamee': + $value = wfEscapeWikiText( $this->mTitle->getPartialURL() ); + break; + case 'fullpagename': + $value = wfEscapeWikiText( $this->mTitle->getPrefixedText() ); + break; + case 'fullpagenamee': + $value = wfEscapeWikiText( $this->mTitle->getPrefixedURL() ); + break; + case 'subpagename': + $value = wfEscapeWikiText( $this->mTitle->getSubpageText() ); + break; + case 'subpagenamee': + $value = wfEscapeWikiText( $this->mTitle->getSubpageUrlForm() ); + break; + case 'rootpagename': + $value = wfEscapeWikiText( $this->mTitle->getRootText() ); + break; + case 'rootpagenamee': + $value = wfEscapeWikiText( wfUrlencode( str_replace( + ' ', + '_', + $this->mTitle->getRootText() + ) ) ); + break; + case 'basepagename': + $value = wfEscapeWikiText( $this->mTitle->getBaseText() ); + break; + case 'basepagenamee': + $value = wfEscapeWikiText( wfUrlencode( str_replace( + ' ', + '_', + $this->mTitle->getBaseText() + ) ) ); + break; + case 'talkpagename': + if ( $this->mTitle->canHaveTalkPage() ) { + $talkPage = $this->mTitle->getTalkPage(); + $value = wfEscapeWikiText( $talkPage->getPrefixedText() ); + } else { + $value = ''; + } + break; + case 'talkpagenamee': + if ( $this->mTitle->canHaveTalkPage() ) { + $talkPage = $this->mTitle->getTalkPage(); + $value = wfEscapeWikiText( $talkPage->getPrefixedURL() ); + } else { + $value = ''; + } + break; + case 'subjectpagename': + $subjPage = $this->mTitle->getSubjectPage(); + $value = wfEscapeWikiText( $subjPage->getPrefixedText() ); + break; + 
case 'subjectpagenamee': + $subjPage = $this->mTitle->getSubjectPage(); + $value = wfEscapeWikiText( $subjPage->getPrefixedURL() ); + break; + case 'pageid': // requested in T25427 + $pageid = $this->getTitle()->getArticleID(); + if ( $pageid == 0 ) { + # 0 means the page doesn't exist in the database, + # which means the user is previewing a new page. + # The vary-revision flag must be set, because the magic word + # will have a different value once the page is saved. + $this->mOutput->setFlag( 'vary-revision' ); + wfDebug( __METHOD__ . ": {{PAGEID}} used in a new page, setting vary-revision...\n" ); + } + $value = $pageid ? $pageid : null; + break; + case 'revisionid': + # Let the edit saving system know we should parse the page + # *after* a revision ID has been assigned. + $this->mOutput->setFlag( 'vary-revision-id' ); + wfDebug( __METHOD__ . ": {{REVISIONID}} used, setting vary-revision-id...\n" ); + $value = $this->mRevisionId; + if ( !$value && $this->mOptions->getSpeculativeRevIdCallback() ) { + $value = call_user_func( $this->mOptions->getSpeculativeRevIdCallback() ); + $this->mOutput->setSpeculativeRevIdUsed( $value ); + } + break; + case 'revisionday': + # Let the edit saving system know we should parse the page + # *after* a revision ID has been assigned. This is for null edits. + $this->mOutput->setFlag( 'vary-revision' ); + wfDebug( __METHOD__ . ": {{REVISIONDAY}} used, setting vary-revision...\n" ); + $value = intval( substr( $this->getRevisionTimestamp(), 6, 2 ) ); + break; + case 'revisionday2': + # Let the edit saving system know we should parse the page + # *after* a revision ID has been assigned. This is for null edits. + $this->mOutput->setFlag( 'vary-revision' ); + wfDebug( __METHOD__ . ": {{REVISIONDAY2}} used, setting vary-revision...\n" ); + $value = substr( $this->getRevisionTimestamp(), 6, 2 ); + break; + case 'revisionmonth': + # Let the edit saving system know we should parse the page + # *after* a revision ID has been assigned. 
This is for null edits. + $this->mOutput->setFlag( 'vary-revision' ); + wfDebug( __METHOD__ . ": {{REVISIONMONTH}} used, setting vary-revision...\n" ); + $value = substr( $this->getRevisionTimestamp(), 4, 2 ); + break; + case 'revisionmonth1': + # Let the edit saving system know we should parse the page + # *after* a revision ID has been assigned. This is for null edits. + $this->mOutput->setFlag( 'vary-revision' ); + wfDebug( __METHOD__ . ": {{REVISIONMONTH1}} used, setting vary-revision...\n" ); + $value = intval( substr( $this->getRevisionTimestamp(), 4, 2 ) ); + break; + case 'revisionyear': + # Let the edit saving system know we should parse the page + # *after* a revision ID has been assigned. This is for null edits. + $this->mOutput->setFlag( 'vary-revision' ); + wfDebug( __METHOD__ . ": {{REVISIONYEAR}} used, setting vary-revision...\n" ); + $value = substr( $this->getRevisionTimestamp(), 0, 4 ); + break; + case 'revisiontimestamp': + # Let the edit saving system know we should parse the page + # *after* a revision ID has been assigned. This is for null edits. + $this->mOutput->setFlag( 'vary-revision' ); + wfDebug( __METHOD__ . ": {{REVISIONTIMESTAMP}} used, setting vary-revision...\n" ); + $value = $this->getRevisionTimestamp(); + break; + case 'revisionuser': + # Let the edit saving system know we should parse the page + # *after* a revision ID has been assigned for null edits. + $this->mOutput->setFlag( 'vary-user' ); + wfDebug( __METHOD__ . 
": {{REVISIONUSER}} used, setting vary-user...\n" ); + $value = $this->getRevisionUser(); + break; + case 'revisionsize': + $value = $this->getRevisionSize(); + break; + case 'namespace': + $value = str_replace( '_', ' ', $wgContLang->getNsText( $this->mTitle->getNamespace() ) ); + break; + case 'namespacee': + $value = wfUrlencode( $wgContLang->getNsText( $this->mTitle->getNamespace() ) ); + break; + case 'namespacenumber': + $value = $this->mTitle->getNamespace(); + break; + case 'talkspace': + $value = $this->mTitle->canHaveTalkPage() + ? str_replace( '_', ' ', $this->mTitle->getTalkNsText() ) + : ''; + break; + case 'talkspacee': + $value = $this->mTitle->canHaveTalkPage() ? wfUrlencode( $this->mTitle->getTalkNsText() ) : ''; + break; + case 'subjectspace': + $value = str_replace( '_', ' ', $this->mTitle->getSubjectNsText() ); + break; + case 'subjectspacee': + $value = ( wfUrlencode( $this->mTitle->getSubjectNsText() ) ); + break; + case 'currentdayname': + $value = $pageLang->getWeekdayName( (int)MWTimestamp::getInstance( $ts )->format( 'w' ) + 1 ); + break; + case 'currentyear': + $value = $pageLang->formatNum( MWTimestamp::getInstance( $ts )->format( 'Y' ), true ); + break; + case 'currenttime': + $value = $pageLang->time( wfTimestamp( TS_MW, $ts ), false, false ); + break; + case 'currenthour': + $value = $pageLang->formatNum( MWTimestamp::getInstance( $ts )->format( 'H' ), true ); + break; + case 'currentweek': + # @bug T6594 PHP5 has it zero padded, PHP4 does not, cast to + # int to remove the padding + $value = $pageLang->formatNum( (int)MWTimestamp::getInstance( $ts )->format( 'W' ) ); + break; + case 'currentdow': + $value = $pageLang->formatNum( MWTimestamp::getInstance( $ts )->format( 'w' ) ); + break; + case 'localdayname': + $value = $pageLang->getWeekdayName( + (int)MWTimestamp::getLocalInstance( $ts )->format( 'w' ) + 1 + ); + break; + case 'localyear': + $value = $pageLang->formatNum( MWTimestamp::getLocalInstance( $ts )->format( 'Y' ), true ); 
+ break; + case 'localtime': + $value = $pageLang->time( + MWTimestamp::getLocalInstance( $ts )->format( 'YmdHis' ), + false, + false + ); + break; + case 'localhour': + $value = $pageLang->formatNum( MWTimestamp::getLocalInstance( $ts )->format( 'H' ), true ); + break; + case 'localweek': + # @bug T6594 PHP5 has it zero padded, PHP4 does not, cast to + # int to remove the padding + $value = $pageLang->formatNum( (int)MWTimestamp::getLocalInstance( $ts )->format( 'W' ) ); + break; + case 'localdow': + $value = $pageLang->formatNum( MWTimestamp::getLocalInstance( $ts )->format( 'w' ) ); + break; + case 'numberofarticles': + $value = $pageLang->formatNum( SiteStats::articles() ); + break; + case 'numberoffiles': + $value = $pageLang->formatNum( SiteStats::images() ); + break; + case 'numberofusers': + $value = $pageLang->formatNum( SiteStats::users() ); + break; + case 'numberofactiveusers': + $value = $pageLang->formatNum( SiteStats::activeUsers() ); + break; + case 'numberofpages': + $value = $pageLang->formatNum( SiteStats::pages() ); + break; + case 'numberofadmins': + $value = $pageLang->formatNum( SiteStats::numberingroup( 'sysop' ) ); + break; + case 'numberofedits': + $value = $pageLang->formatNum( SiteStats::edits() ); + break; + case 'currenttimestamp': + $value = wfTimestamp( TS_MW, $ts ); + break; + case 'localtimestamp': + $value = MWTimestamp::getLocalInstance( $ts )->format( 'YmdHis' ); + break; + case 'currentversion': + $value = SpecialVersion::getVersion(); + break; + case 'articlepath': + return $wgArticlePath; + case 'sitename': + return $wgSitename; + case 'server': + return $wgServer; + case 'servername': + return $wgServerName; + case 'scriptpath': + return $wgScriptPath; + case 'stylepath': + return $wgStylePath; + case 'directionmark': + return $pageLang->getDirMark(); + case 'contentlanguage': + global $wgLanguageCode; + return $wgLanguageCode; + case 'pagelanguage': + $value = $pageLang->getCode(); + break; + case 'cascadingsources': + 
$value = CoreParserFunctions::cascadingsources( $this ); + break; + default: + $ret = null; + Hooks::run( + 'ParserGetVariableValueSwitch', + [ &$parser, &$this->mVarCache, &$index, &$ret, &$frame ] + ); + + return $ret; + } + + if ( $index ) { + $this->mVarCache[$index] = $value; + } + + return $value; + } + + /** + * initialise the magic variables (like CURRENTMONTHNAME) and substitution modifiers + * + * @private + */ + public function initialiseVariables() { + $variableIDs = MagicWord::getVariableIDs(); + $substIDs = MagicWord::getSubstIDs(); + + $this->mVariables = new MagicWordArray( $variableIDs ); + $this->mSubstWords = new MagicWordArray( $substIDs ); + } + + /** + * Preprocess some wikitext and return the document tree. + * This is the ghost of replace_variables(). + * + * @param string $text The text to parse + * @param int $flags Bitwise combination of: + * - self::PTD_FOR_INCLUSION: Handle "<noinclude>" and "<includeonly>" as if the text is being + * included. Default is to assume a direct page view. + * + * The generated DOM tree must depend only on the input text and the flags. + * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of T6899. + * + * Any flag added to the $flags parameter here, or any other parameter liable to cause a + * change in the DOM tree for a given text, must be passed through the section identifier + * in the section edit link and thus back to extractSections(). + * + * The output of this function is currently only cached in process memory, but a persistent + * cache may be implemented at a later date which takes further advantage of these strict + * dependency requirements. 
+ * + * @return PPNode + */ + public function preprocessToDom( $text, $flags = 0 ) { + $dom = $this->getPreprocessor()->preprocessToObj( $text, $flags ); + return $dom; + } + + /** + * Return a three-element array: leading whitespace, string contents, trailing whitespace + * + * @param string $s + * + * @return array + */ + public static function splitWhitespace( $s ) { + $ltrimmed = ltrim( $s ); + $w1 = substr( $s, 0, strlen( $s ) - strlen( $ltrimmed ) ); + $trimmed = rtrim( $ltrimmed ); + $diff = strlen( $ltrimmed ) - strlen( $trimmed ); + if ( $diff > 0 ) { + $w2 = substr( $ltrimmed, -$diff ); + } else { + $w2 = ''; + } + return [ $w1, $trimmed, $w2 ]; + } + + /** + * Replace magic variables, templates, and template arguments + * with the appropriate text. Templates are substituted recursively, + * taking care to avoid infinite loops. + * + * Note that the substitution depends on value of $mOutputType: + * self::OT_WIKI: only {{subst:}} templates + * self::OT_PREPROCESS: templates but not extension tags + * self::OT_HTML: all templates and extension tags + * + * @param string $text The text to transform + * @param bool|PPFrame $frame Object describing the arguments passed to the + * template. Arguments may also be provided as an associative array, as + * was the usual case before MW1.12. Providing arguments this way may be + * useful for extensions wishing to perform variable replacement + * explicitly. + * @param bool $argsOnly Only do argument (triple-brace) expansion, not + * double-brace expansion. + * @return string + */ + public function replaceVariables( $text, $frame = false, $argsOnly = false ) { + # Is there any text? Also, Prevent too big inclusions! + $textSize = strlen( $text ); + if ( $textSize < 1 || $textSize > $this->mOptions->getMaxIncludeSize() ) { + return $text; + } + + if ( $frame === false ) { + $frame = $this->getPreprocessor()->newFrame(); + } elseif ( !( $frame instanceof PPFrame ) ) { + wfDebug( __METHOD__ . 
" called using plain parameters instead of " + . "a PPFrame instance. Creating custom frame.\n" ); + $frame = $this->getPreprocessor()->newCustomFrame( $frame ); + } + + $dom = $this->preprocessToDom( $text ); + $flags = $argsOnly ? PPFrame::NO_TEMPLATES : 0; + $text = $frame->expand( $dom, $flags ); + + return $text; + } + + /** + * Clean up argument array - refactored in 1.9 so parserfunctions can use it, too. + * + * @param array $args + * + * @return array + */ + public static function createAssocArgs( $args ) { + $assocArgs = []; + $index = 1; + foreach ( $args as $arg ) { + $eqpos = strpos( $arg, '=' ); + if ( $eqpos === false ) { + $assocArgs[$index++] = $arg; + } else { + $name = trim( substr( $arg, 0, $eqpos ) ); + $value = trim( substr( $arg, $eqpos + 1 ) ); + if ( $value === false ) { + $value = ''; + } + if ( $name !== false ) { + $assocArgs[$name] = $value; + } + } + } + + return $assocArgs; + } + + /** + * Warn the user when a parser limitation is reached + * Will warn at most once the user per limitation type + * + * The results are shown during preview and run through the Parser (See EditPage.php) + * + * @param string $limitationType Should be one of: + * 'expensive-parserfunction' (corresponding messages: + * 'expensive-parserfunction-warning', + * 'expensive-parserfunction-category') + * 'post-expand-template-argument' (corresponding messages: + * 'post-expand-template-argument-warning', + * 'post-expand-template-argument-category') + * 'post-expand-template-inclusion' (corresponding messages: + * 'post-expand-template-inclusion-warning', + * 'post-expand-template-inclusion-category') + * 'node-count-exceeded' (corresponding messages: + * 'node-count-exceeded-warning', + * 'node-count-exceeded-category') + * 'expansion-depth-exceeded' (corresponding messages: + * 'expansion-depth-exceeded-warning', + * 'expansion-depth-exceeded-category') + * @param string|int|null $current Current value + * @param string|int|null $max Maximum allowed, when an 
explicit limit has been + * exceeded, provide the values (optional) + */ + public function limitationWarn( $limitationType, $current = '', $max = '' ) { + # does no harm if $current and $max are present but are unnecessary for the message + # Not doing ->inLanguage( $this->mOptions->getUserLangObj() ), since this is shown + # only during preview, and that would split the parser cache unnecessarily. + $warning = wfMessage( "$limitationType-warning" )->numParams( $current, $max ) + ->text(); + $this->mOutput->addWarning( $warning ); + $this->addTrackingCategory( "$limitationType-category" ); + } + + /** + * Return the text of a template, after recursively + * replacing any variables or templates within the template. + * + * @param array $piece The parts of the template + * $piece['title']: the title, i.e. the part before the | + * $piece['parts']: the parameter array + * $piece['lineStart']: whether the brace was at the start of a line + * @param PPFrame $frame The current frame, contains template arguments + * @throws Exception + * @return string The text of the template + */ + public function braceSubstitution( $piece, $frame ) { + // Flags + + // $text has been filled + $found = false; + // wiki markup in $text should be escaped + $nowiki = false; + // $text is HTML, armour it against wikitext transformation + $isHTML = false; + // Force interwiki transclusion to be done in raw mode not rendered + $forceRawInterwiki = false; + // $text is a DOM node needing expansion in a child frame + $isChildObj = false; + // $text is a DOM node needing expansion in the current frame + $isLocalObj = false; + + # Title object, where $text came from + $title = false; + + # $part1 is the bit before the first |, and must contain only title characters. + # Various prefixes will be stripped from it later. 
+ $titleWithSpaces = $frame->expand( $piece['title'] ); + $part1 = trim( $titleWithSpaces ); + $titleText = false; + + # Original title text preserved for various purposes + $originalTitle = $part1; + + # $args is a list of argument nodes, starting from index 0, not including $part1 + # @todo FIXME: If piece['parts'] is null then the call to getLength() + # below won't work b/c this $args isn't an object + $args = ( null == $piece['parts'] ) ? [] : $piece['parts']; + + $profileSection = null; // profile templates + + # SUBST + if ( !$found ) { + $substMatch = $this->mSubstWords->matchStartAndRemove( $part1 ); + + # Possibilities for substMatch: "subst", "safesubst" or FALSE + # Decide whether to expand template or keep wikitext as-is. + if ( $this->ot['wiki'] ) { + if ( $substMatch === false ) { + $literal = true; # literal when in PST with no prefix + } else { + $literal = false; # expand when in PST with subst: or safesubst: + } + } else { + if ( $substMatch == 'subst' ) { + $literal = true; # literal when not in PST with plain subst: + } else { + $literal = false; # expand when not in PST with safesubst: or no prefix + } + } + if ( $literal ) { + $text = $frame->virtualBracketedImplode( '{{', '|', '}}', $titleWithSpaces, $args ); + $isLocalObj = true; + $found = true; + } + } + + # Variables + if ( !$found && $args->getLength() == 0 ) { + $id = $this->mVariables->matchStartToEnd( $part1 ); + if ( $id !== false ) { + $text = $this->getVariableValue( $id, $frame ); + if ( MagicWord::getCacheTTL( $id ) > -1 ) { + $this->mOutput->updateCacheExpiry( MagicWord::getCacheTTL( $id ) ); + } + $found = true; + } + } + + # MSG, MSGNW and RAW + if ( !$found ) { + # Check for MSGNW: + $mwMsgnw = MagicWord::get( 'msgnw' ); + if ( $mwMsgnw->matchStartAndRemove( $part1 ) ) { + $nowiki = true; + } else { + # Remove obsolete MSG: + $mwMsg = MagicWord::get( 'msg' ); + $mwMsg->matchStartAndRemove( $part1 ); + } + + # Check for RAW: + $mwRaw = MagicWord::get( 'raw' ); + if ( 
$mwRaw->matchStartAndRemove( $part1 ) ) { + $forceRawInterwiki = true; + } + } + + # Parser functions + if ( !$found ) { + $colonPos = strpos( $part1, ':' ); + if ( $colonPos !== false ) { + $func = substr( $part1, 0, $colonPos ); + $funcArgs = [ trim( substr( $part1, $colonPos + 1 ) ) ]; + $argsLength = $args->getLength(); + for ( $i = 0; $i < $argsLength; $i++ ) { + $funcArgs[] = $args->item( $i ); + } + try { + $result = $this->callParserFunction( $frame, $func, $funcArgs ); + } catch ( Exception $ex ) { + throw $ex; + } + + // Extract any forwarded flags + if ( isset( $result['title'] ) ) { + $title = $result['title']; + } + if ( isset( $result['found'] ) ) { + $found = $result['found']; + } + if ( array_key_exists( 'text', $result ) ) { + // a string or null + $text = $result['text']; + } + if ( isset( $result['nowiki'] ) ) { + $nowiki = $result['nowiki']; + } + if ( isset( $result['isHTML'] ) ) { + $isHTML = $result['isHTML']; + } + if ( isset( $result['forceRawInterwiki'] ) ) { + $forceRawInterwiki = $result['forceRawInterwiki']; + } + if ( isset( $result['isChildObj'] ) ) { + $isChildObj = $result['isChildObj']; + } + if ( isset( $result['isLocalObj'] ) ) { + $isLocalObj = $result['isLocalObj']; + } + } + } + + # Finish mangling title and then check for loops. 
+ # Set $title to a Title object and $titleText to the PDBK + if ( !$found ) { + $ns = NS_TEMPLATE; + # Split the title into page and subpage + $subpage = ''; + $relative = $this->maybeDoSubpageLink( $part1, $subpage ); + if ( $part1 !== $relative ) { + $part1 = $relative; + $ns = $this->mTitle->getNamespace(); + } + $title = Title::newFromText( $part1, $ns ); + if ( $title ) { + $titleText = $title->getPrefixedText(); + # Check for language variants if the template is not found + if ( $this->getConverterLanguage()->hasVariants() && $title->getArticleID() == 0 ) { + $this->getConverterLanguage()->findVariantLink( $part1, $title, true ); + } + # Do recursion depth check + $limit = $this->mOptions->getMaxTemplateDepth(); + if ( $frame->depth >= $limit ) { + $found = true; + $text = '<span class="error">' + . wfMessage( 'parser-template-recursion-depth-warning' ) + ->numParams( $limit )->inContentLanguage()->text() + . '</span>'; + } + } + } + + # Load from database + if ( !$found && $title ) { + $profileSection = $this->mProfiler->scopedProfileIn( $title->getPrefixedDBkey() ); + if ( !$title->isExternal() ) { + if ( $title->isSpecialPage() + && $this->mOptions->getAllowSpecialInclusion() + && $this->ot['html'] + ) { + $specialPage = SpecialPageFactory::getPage( $title->getDBkey() ); + // Pass the template arguments as URL parameters. + // "uselang" will have no effect since the Language object + // is forced to the one defined in ParserOptions. 
+ $pageArgs = []; + $argsLength = $args->getLength(); + for ( $i = 0; $i < $argsLength; $i++ ) { + $bits = $args->item( $i )->splitArg(); + if ( strval( $bits['index'] ) === '' ) { + $name = trim( $frame->expand( $bits['name'], PPFrame::STRIP_COMMENTS ) ); + $value = trim( $frame->expand( $bits['value'] ) ); + $pageArgs[$name] = $value; + } + } + + // Create a new context to execute the special page + $context = new RequestContext; + $context->setTitle( $title ); + $context->setRequest( new FauxRequest( $pageArgs ) ); + if ( $specialPage && $specialPage->maxIncludeCacheTime() === 0 ) { + $context->setUser( $this->getUser() ); + } else { + // If this page is cached, then we better not be per user. + $context->setUser( User::newFromName( '127.0.0.1', false ) ); + } + $context->setLanguage( $this->mOptions->getUserLangObj() ); + $ret = SpecialPageFactory::capturePath( + $title, $context, $this->getLinkRenderer() ); + if ( $ret ) { + $text = $context->getOutput()->getHTML(); + $this->mOutput->addOutputPageMetadata( $context->getOutput() ); + $found = true; + $isHTML = true; + if ( $specialPage && $specialPage->maxIncludeCacheTime() !== false ) { + $this->mOutput->updateRuntimeAdaptiveExpiry( + $specialPage->maxIncludeCacheTime() + ); + } + } + } elseif ( MWNamespace::isNonincludable( $title->getNamespace() ) ) { + $found = false; # access denied + wfDebug( __METHOD__ . ": template inclusion denied for " . + $title->getPrefixedDBkey() . 
"\n" ); + } else { + list( $text, $title ) = $this->getTemplateDom( $title ); + if ( $text !== false ) { + $found = true; + $isChildObj = true; + } + } + + # If the title is valid but undisplayable, make a link to it + if ( !$found && ( $this->ot['html'] || $this->ot['pre'] ) ) { + $text = "[[:$titleText]]"; + $found = true; + } + } elseif ( $title->isTrans() ) { + # Interwiki transclusion + if ( $this->ot['html'] && !$forceRawInterwiki ) { + $text = $this->interwikiTransclude( $title, 'render' ); + $isHTML = true; + } else { + $text = $this->interwikiTransclude( $title, 'raw' ); + # Preprocess it like a template + $text = $this->preprocessToDom( $text, self::PTD_FOR_INCLUSION ); + $isChildObj = true; + } + $found = true; + } + + # Do infinite loop check + # This has to be done after redirect resolution to avoid infinite loops via redirects + if ( !$frame->loopCheck( $title ) ) { + $found = true; + $text = '<span class="error">' + . wfMessage( 'parser-template-loop-warning', $titleText )->inContentLanguage()->text() + . '</span>'; + $this->addTrackingCategory( 'template-loop-category' ); + $this->mOutput->addWarning( wfMessage( 'template-loop-warning', + wfEscapeWikiText( $titleText ) )->text() ); + wfDebug( __METHOD__ . 
": template loop broken at '$titleText'\n" ); + } + } + + # If we haven't found text to substitute by now, we're done + # Recover the source wikitext and return it + if ( !$found ) { + $text = $frame->virtualBracketedImplode( '{{', '|', '}}', $titleWithSpaces, $args ); + if ( $profileSection ) { + $this->mProfiler->scopedProfileOut( $profileSection ); + } + return [ 'object' => $text ]; + } + + # Expand DOM-style return values in a child frame + if ( $isChildObj ) { + # Clean up argument array + $newFrame = $frame->newChild( $args, $title ); + + if ( $nowiki ) { + $text = $newFrame->expand( $text, PPFrame::RECOVER_ORIG ); + } elseif ( $titleText !== false && $newFrame->isEmpty() ) { + # Expansion is eligible for the empty-frame cache + $text = $newFrame->cachedExpand( $titleText, $text ); + } else { + # Uncached expansion + $text = $newFrame->expand( $text ); + } + } + if ( $isLocalObj && $nowiki ) { + $text = $frame->expand( $text, PPFrame::RECOVER_ORIG ); + $isLocalObj = false; + } + + if ( $profileSection ) { + $this->mProfiler->scopedProfileOut( $profileSection ); + } + + # Replace raw HTML by a placeholder + if ( $isHTML ) { + $text = $this->insertStripItem( $text ); + } elseif ( $nowiki && ( $this->ot['html'] || $this->ot['pre'] ) ) { + # Escape nowiki-style return values + $text = wfEscapeWikiText( $text ); + } elseif ( is_string( $text ) + && !$piece['lineStart'] + && preg_match( '/^(?:{\\||:|;|#|\*)/', $text ) + ) { + # T2529: if the template begins with a table or block-level + # element, it should be treated as beginning a new line. + # This behavior is somewhat controversial. + $text = "\n" . 
$text; + } + + if ( is_string( $text ) && !$this->incrementIncludeSize( 'post-expand', strlen( $text ) ) ) { + # Error, oversize inclusion + if ( $titleText !== false ) { + # Make a working, properly escaped link if possible (T25588) + $text = "[[:$titleText]]"; + } else { + # This will probably not be a working link, but at least it may + # provide some hint of where the problem is + # preg_replace() returns the new string instead of modifying its subject, + # so the result must be assigned back; otherwise a title that already + # starts with ':' would produce "[[::...]]" below. + $originalTitle = preg_replace( '/^:/', '', $originalTitle ); + $text = "[[:$originalTitle]]"; + } + $text .= $this->insertStripItem( '<!-- WARNING: template omitted, ' + . 'post-expand include size too large -->' ); + $this->limitationWarn( 'post-expand-template-inclusion' ); + } + + if ( $isLocalObj ) { + $ret = [ 'object' => $text ]; + } else { + $ret = [ 'text' => $text ]; + } + + return $ret; + } + + /** + * Call a parser function and return an array with text and flags. + * + * The returned array will always contain a boolean 'found', indicating + * whether the parser function was found or not. It may also contain the + * following: + * text: string|object, resulting wikitext or PP DOM object + * isHTML: bool, $text is HTML, armour it against wikitext transformation + * isChildObj: bool, $text is a DOM node needing expansion in a child frame + * isLocalObj: bool, $text is a DOM node needing expansion in the current frame + * nowiki: bool, wiki markup in $text should be escaped + * + * @since 1.21 + * @param PPFrame $frame The current frame, contains template arguments + * @param string $function Function name + * @param array $args Arguments to the function + * @throws MWException + * @return array + */ + public function callParserFunction( $frame, $function, array $args = [] ) { + global $wgContLang; + + # Case sensitive functions + if ( isset( $this->mFunctionSynonyms[1][$function] ) ) { + $function = $this->mFunctionSynonyms[1][$function]; + } else { + # Case insensitive functions + $function = $wgContLang->lc( $function ); + if ( isset( $this->mFunctionSynonyms[0][$function] ) ) { + $function 
= $this->mFunctionSynonyms[0][$function]; + } else { + return [ 'found' => false ]; + } + } + + list( $callback, $flags ) = $this->mFunctionHooks[$function]; + + // Avoid PHP 7.1 warning from passing $this by reference + $parser = $this; + + $allArgs = [ &$parser ]; + if ( $flags & self::SFH_OBJECT_ARGS ) { + # Convert arguments to PPNodes and collect for appending to $allArgs + $funcArgs = []; + foreach ( $args as $k => $v ) { + if ( $v instanceof PPNode || $k === 0 ) { + $funcArgs[] = $v; + } else { + $funcArgs[] = $this->mPreprocessor->newPartNodeArray( [ $k => $v ] )->item( 0 ); + } + } + + # Add a frame parameter, and pass the arguments as an array + $allArgs[] = $frame; + $allArgs[] = $funcArgs; + } else { + # Convert arguments to plain text and append to $allArgs + foreach ( $args as $k => $v ) { + if ( $v instanceof PPNode ) { + $allArgs[] = trim( $frame->expand( $v ) ); + } elseif ( is_int( $k ) && $k >= 0 ) { + $allArgs[] = trim( $v ); + } else { + $allArgs[] = trim( "$k=$v" ); + } + } + } + + $result = call_user_func_array( $callback, $allArgs ); + + # The interface for function hooks allows them to return a wikitext + # string or an array containing the string and any flags. This mungs + # things around to match what this method should return. 
+ if ( !is_array( $result ) ) { + $result = [ + 'found' => true, + 'text' => $result, + ]; + } else { + if ( isset( $result[0] ) && !isset( $result['text'] ) ) { + $result['text'] = $result[0]; + } + unset( $result[0] ); + $result += [ + 'found' => true, + ]; + } + + $noparse = true; + $preprocessFlags = 0; + if ( isset( $result['noparse'] ) ) { + $noparse = $result['noparse']; + } + if ( isset( $result['preprocessFlags'] ) ) { + $preprocessFlags = $result['preprocessFlags']; + } + + if ( !$noparse ) { + $result['text'] = $this->preprocessToDom( $result['text'], $preprocessFlags ); + $result['isChildObj'] = true; + } + + return $result; + } + + /** + * Get the semi-parsed DOM representation of a template with a given title, + * and its redirect destination title. Cached. + * + * @param Title $title + * + * @return array + */ + public function getTemplateDom( $title ) { + $cacheTitle = $title; + $titleText = $title->getPrefixedDBkey(); + + if ( isset( $this->mTplRedirCache[$titleText] ) ) { + list( $ns, $dbk ) = $this->mTplRedirCache[$titleText]; + $title = Title::makeTitle( $ns, $dbk ); + $titleText = $title->getPrefixedDBkey(); + } + if ( isset( $this->mTplDomCache[$titleText] ) ) { + return [ $this->mTplDomCache[$titleText], $title ]; + } + + # Cache miss, go to the database + list( $text, $title ) = $this->fetchTemplateAndTitle( $title ); + + if ( $text === false ) { + $this->mTplDomCache[$titleText] = false; + return [ false, $title ]; + } + + $dom = $this->preprocessToDom( $text, self::PTD_FOR_INCLUSION ); + $this->mTplDomCache[$titleText] = $dom; + + if ( !$title->equals( $cacheTitle ) ) { + $this->mTplRedirCache[$cacheTitle->getPrefixedDBkey()] = + [ $title->getNamespace(), $title->getDBkey() ]; + } + + return [ $dom, $title ]; + } + + /** + * Fetch the current revision of a given title. 
Note that the revision + * (and even the title) may not exist in the database, so everything + * contributing to the output of the parser should use this method + * where possible, rather than getting the revisions themselves. This + * method also caches its results, so using it benefits performance. + * + * @since 1.24 + * @param Title $title + * @return Revision + */ + public function fetchCurrentRevisionOfTitle( $title ) { + $cacheKey = $title->getPrefixedDBkey(); + if ( !$this->currentRevisionCache ) { + $this->currentRevisionCache = new MapCacheLRU( 100 ); + } + if ( !$this->currentRevisionCache->has( $cacheKey ) ) { + $this->currentRevisionCache->set( $cacheKey, + // Defaults to Parser::statelessFetchRevision() + call_user_func( $this->mOptions->getCurrentRevisionCallback(), $title, $this ) + ); + } + return $this->currentRevisionCache->get( $cacheKey ); + } + + /** + * Wrapper around Revision::newFromTitle to allow passing additional parameters + * without passing them on to it. + * + * @since 1.24 + * @param Title $title + * @param Parser|bool $parser + * @return Revision|bool False if missing + */ + public static function statelessFetchRevision( Title $title, $parser = false ) { + $rev = Revision::newKnownCurrent( wfGetDB( DB_REPLICA ), $title ); + + return $rev; + } + + /** + * Fetch the unparsed text of a template and register a reference to it. + * @param Title $title + * @return array ( string or false, Title ) + */ + public function fetchTemplateAndTitle( $title ) { + // Defaults to Parser::statelessFetchTemplate() + $templateCb = $this->mOptions->getTemplateCallback(); + $stuff = call_user_func( $templateCb, $title, $this ); + // We use U+007F DELETE to distinguish strip markers from regular text. + $text = $stuff['text']; + if ( is_string( $stuff['text'] ) ) { + $text = strtr( $text, "\x7f", "?" ); + } + $finalTitle = isset( $stuff['finalTitle'] ) ? 
$stuff['finalTitle'] : $title; + if ( isset( $stuff['deps'] ) ) { + foreach ( $stuff['deps'] as $dep ) { + $this->mOutput->addTemplate( $dep['title'], $dep['page_id'], $dep['rev_id'] ); + if ( $dep['title']->equals( $this->getTitle() ) ) { + // If we transclude ourselves, the final result + // will change based on the new version of the page + $this->mOutput->setFlag( 'vary-revision' ); + } + } + } + return [ $text, $finalTitle ]; + } + + /** + * Fetch the unparsed text of a template and register a reference to it. + * @param Title $title + * @return string|bool + */ + public function fetchTemplate( $title ) { + return $this->fetchTemplateAndTitle( $title )[0]; + } + + /** + * Static function to get a template + * Can be overridden via ParserOptions::setTemplateCallback(). + * + * @param Title $title + * @param bool|Parser $parser + * + * @return array + */ + public static function statelessFetchTemplate( $title, $parser = false ) { + $text = $skip = false; + $finalTitle = $title; + $deps = []; + + # Loop to fetch the article, with up to 1 redirect + // phpcs:ignore Generic.CodeAnalysis.ForLoopWithTestFunctionCall + for ( $i = 0; $i < 2 && is_object( $title ); $i++ ) { + # Give extensions a chance to select the revision instead + $id = false; # Assume current + Hooks::run( 'BeforeParserFetchTemplateAndtitle', + [ $parser, $title, &$skip, &$id ] ); + + if ( $skip ) { + $text = false; + $deps[] = [ + 'title' => $title, + 'page_id' => $title->getArticleID(), + 'rev_id' => null + ]; + break; + } + # Get the revision + if ( $id ) { + $rev = Revision::newFromId( $id ); + } elseif ( $parser ) { + $rev = $parser->fetchCurrentRevisionOfTitle( $title ); + } else { + $rev = Revision::newFromTitle( $title ); + } + $rev_id = $rev ? 
$rev->getId() : 0; + # If there is no current revision, there is no page + if ( $id === false && !$rev ) { + $linkCache = LinkCache::singleton(); + $linkCache->addBadLinkObj( $title ); + } + + $deps[] = [ + 'title' => $title, + 'page_id' => $title->getArticleID(), + 'rev_id' => $rev_id ]; + if ( $rev && !$title->equals( $rev->getTitle() ) ) { + # We fetched a rev from a different title; register it too... + $deps[] = [ + 'title' => $rev->getTitle(), + 'page_id' => $rev->getPage(), + 'rev_id' => $rev_id ]; + } + + if ( $rev ) { + $content = $rev->getContent(); + $text = $content ? $content->getWikitextForTransclusion() : null; + + Hooks::run( 'ParserFetchTemplate', + [ $parser, $title, $rev, &$text, &$deps ] ); + + if ( $text === false || $text === null ) { + $text = false; + break; + } + } elseif ( $title->getNamespace() == NS_MEDIAWIKI ) { + global $wgContLang; + $message = wfMessage( $wgContLang->lcfirst( $title->getText() ) )->inContentLanguage(); + if ( !$message->exists() ) { + $text = false; + break; + } + $content = $message->content(); + $text = $message->plain(); + } else { + break; + } + if ( !$content ) { + break; + } + # Redirect? + $finalTitle = $title; + $title = $content->getRedirectTarget(); + } + return [ + 'text' => $text, + 'finalTitle' => $finalTitle, + 'deps' => $deps ]; + } + + /** + * Fetch a file and its title and register a reference to it. + * If 'broken' is a key in $options then the file will appear as a broken thumbnail. + * @param Title $title + * @param array $options Array of options to RepoGroup::findFile + * @return File|bool + */ + public function fetchFile( $title, $options = [] ) { + return $this->fetchFileAndTitle( $title, $options )[0]; + } + + /** + * Fetch a file and its title and register a reference to it. + * If 'broken' is a key in $options then the file will appear as a broken thumbnail. 
+ * @param Title $title + * @param array $options Array of options to RepoGroup::findFile + * @return array ( File or false, Title of file ) + */ + public function fetchFileAndTitle( $title, $options = [] ) { + $file = $this->fetchFileNoRegister( $title, $options ); + + $time = $file ? $file->getTimestamp() : false; + $sha1 = $file ? $file->getSha1() : false; + # Register the file as a dependency... + $this->mOutput->addImage( $title->getDBkey(), $time, $sha1 ); + if ( $file && !$title->equals( $file->getTitle() ) ) { + # Update fetched file title + $title = $file->getTitle(); + $this->mOutput->addImage( $title->getDBkey(), $time, $sha1 ); + } + return [ $file, $title ]; + } + + /** + * Helper function for fetchFileAndTitle. + * + * Also useful if you need to fetch a file but not use it yet, + * for example to get the file's handler. + * + * @param Title $title + * @param array $options Array of options to RepoGroup::findFile + * @return File|bool + */ + protected function fetchFileNoRegister( $title, $options = [] ) { + if ( isset( $options['broken'] ) ) { + $file = false; // broken thumbnail forced by hook + } elseif ( isset( $options['sha1'] ) ) { // get by (sha1,timestamp) + $file = RepoGroup::singleton()->findFileFromKey( $options['sha1'], $options ); + } else { // get by (name,timestamp) + $file = wfFindFile( $title, $options ); + } + return $file; + } + + /** + * Transclude an interwiki link. 
+ * + * @param Title $title + * @param string $action + * + * @return string + */ + public function interwikiTransclude( $title, $action ) { + global $wgEnableScaryTranscluding; + + if ( !$wgEnableScaryTranscluding ) { + return wfMessage( 'scarytranscludedisabled' )->inContentLanguage()->text(); + } + + $url = $title->getFullURL( [ 'action' => $action ] ); + + if ( strlen( $url ) > 255 ) { + return wfMessage( 'scarytranscludetoolong' )->inContentLanguage()->text(); + } + return $this->fetchScaryTemplateMaybeFromCache( $url ); + } + + /** + * @param string $url + * @return mixed|string + */ + public function fetchScaryTemplateMaybeFromCache( $url ) { + global $wgTranscludeCacheExpiry; + $dbr = wfGetDB( DB_REPLICA ); + $tsCond = $dbr->timestamp( time() - $wgTranscludeCacheExpiry ); + $obj = $dbr->selectRow( 'transcache', [ 'tc_time', 'tc_contents' ], + [ 'tc_url' => $url, "tc_time >= " . $dbr->addQuotes( $tsCond ) ] ); + if ( $obj ) { + return $obj->tc_contents; + } + + $req = MWHttpRequest::factory( $url, [], __METHOD__ ); + $status = $req->execute(); // Status object + if ( $status->isOK() ) { + $text = $req->getContent(); + } elseif ( $req->getStatus() != 200 ) { + // Though we failed to fetch the content, this status is useless. 
+ return wfMessage( 'scarytranscludefailed-httpstatus' ) + ->params( $url, $req->getStatus() /* HTTP status */ )->inContentLanguage()->text(); + } else { + return wfMessage( 'scarytranscludefailed', $url )->inContentLanguage()->text(); + } + + $dbw = wfGetDB( DB_MASTER ); + $dbw->replace( 'transcache', [ 'tc_url' ], [ + 'tc_url' => $url, + 'tc_time' => $dbw->timestamp( time() ), + 'tc_contents' => $text + ] ); + return $text; + } + + /** + * Triple brace replacement -- used for template arguments + * @private + * + * @param array $piece + * @param PPFrame $frame + * + * @return array + */ + public function argSubstitution( $piece, $frame ) { + $error = false; + $parts = $piece['parts']; + $nameWithSpaces = $frame->expand( $piece['title'] ); + $argName = trim( $nameWithSpaces ); + $object = false; + $text = $frame->getArgument( $argName ); + if ( $text === false && $parts->getLength() > 0 + && ( $this->ot['html'] + || $this->ot['pre'] + || ( $this->ot['wiki'] && $frame->isTemplate() ) + ) + ) { + # No match in frame, use the supplied default + $object = $parts->item( 0 )->getChildren(); + } + if ( !$this->incrementIncludeSize( 'arg', strlen( $text ) ) ) { + $error = '<!-- WARNING: argument omitted, expansion size too large -->'; + $this->limitationWarn( 'post-expand-template-argument' ); + } + + if ( $text === false && $object === false ) { + # No match anywhere + $object = $frame->virtualBracketedImplode( '{{{', '|', '}}}', $nameWithSpaces, $parts ); + } + if ( $error !== false ) { + $text .= $error; + } + if ( $object !== false ) { + $ret = [ 'object' => $object ]; + } else { + $ret = [ 'text' => $text ]; + } + + return $ret; + } + + /** + * Return the text to be used for a given extension tag. + * This is the ghost of strip(). 
+ * + * @param array $params Associative array of parameters: + * name PPNode for the tag name + * attr PPNode for unparsed text where tag attributes are thought to be + * attributes Optional associative array of parsed attributes + * inner Contents of extension element + * noClose Original text did not have a close tag + * @param PPFrame $frame + * + * @throws MWException + * @return string + */ + public function extensionSubstitution( $params, $frame ) { + static $errorStr = '<span class="error">'; + static $errorLen = 20; + + $name = $frame->expand( $params['name'] ); + if ( substr( $name, 0, $errorLen ) === $errorStr ) { + // Probably expansion depth or node count exceeded. Just punt the + // error up. + return $name; + } + + $attrText = !isset( $params['attr'] ) ? null : $frame->expand( $params['attr'] ); + if ( substr( $attrText, 0, $errorLen ) === $errorStr ) { + // See above + return $attrText; + } + + // We can't safely check if the expansion for $content resulted in an + // error, because the content could happen to be the error string + // (T149622). + $content = !isset( $params['inner'] ) ? null : $frame->expand( $params['inner'] ); + + $marker = self::MARKER_PREFIX . "-$name-" + . sprintf( '%08X', $this->mMarkerIndex++ ) . 
self::MARKER_SUFFIX; + + $isFunctionTag = isset( $this->mFunctionTagHooks[strtolower( $name )] ) && + ( $this->ot['html'] || $this->ot['pre'] ); + if ( $isFunctionTag ) { + $markerType = 'none'; + } else { + $markerType = 'general'; + } + if ( $this->ot['html'] || $isFunctionTag ) { + $name = strtolower( $name ); + $attributes = Sanitizer::decodeTagAttributes( $attrText ); + if ( isset( $params['attributes'] ) ) { + $attributes = $attributes + $params['attributes']; + } + + if ( isset( $this->mTagHooks[$name] ) ) { + $output = call_user_func_array( $this->mTagHooks[$name], + [ $content, $attributes, $this, $frame ] ); + } elseif ( isset( $this->mFunctionTagHooks[$name] ) ) { + list( $callback, ) = $this->mFunctionTagHooks[$name]; + + // Avoid PHP 7.1 warning from passing $this by reference + $parser = $this; + $output = call_user_func_array( $callback, [ &$parser, $frame, $content, $attributes ] ); + } else { + $output = '<span class="error">Invalid tag extension name: ' . + htmlspecialchars( $name ) . '</span>'; + } + + if ( is_array( $output ) ) { + // Extract flags + $flags = $output; + $output = $flags[0]; + if ( isset( $flags['markerType'] ) ) { + $markerType = $flags['markerType']; + } + } + } else { + if ( is_null( $attrText ) ) { + $attrText = ''; + } + if ( isset( $params['attributes'] ) ) { + foreach ( $params['attributes'] as $attrName => $attrValue ) { + $attrText .= ' ' . htmlspecialchars( $attrName ) . '="' . + htmlspecialchars( $attrValue ) . '"'; + } + } + if ( $content === null ) { + $output = "<$name$attrText/>"; + } else { + $close = is_null( $params['close'] ) ? 
'' : $frame->expand( $params['close'] ); + if ( substr( $close, 0, $errorLen ) === $errorStr ) { + // See above + return $close; + } + $output = "<$name$attrText>$content$close"; + } + } + + if ( $markerType === 'none' ) { + return $output; + } elseif ( $markerType === 'nowiki' ) { + $this->mStripState->addNoWiki( $marker, $output ); + } elseif ( $markerType === 'general' ) { + $this->mStripState->addGeneral( $marker, $output ); + } else { + throw new MWException( __METHOD__ . ': invalid marker type' ); + } + return $marker; + } + + /** + * Increment an include size counter + * + * @param string $type The type of expansion + * @param int $size The size of the text + * @return bool False if this inclusion would take it over the maximum, true otherwise + */ + public function incrementIncludeSize( $type, $size ) { + if ( $this->mIncludeSizes[$type] + $size > $this->mOptions->getMaxIncludeSize() ) { + return false; + } else { + $this->mIncludeSizes[$type] += $size; + return true; + } + } + + /** + * Increment the expensive function count + * + * @return bool False if the limit has been exceeded + */ + public function incrementExpensiveFunctionCount() { + $this->mExpensiveFunctionCount++; + return $this->mExpensiveFunctionCount <= $this->mOptions->getExpensiveParserFunctionLimit(); + } + + /** + * Strip double-underscore items like __NOGALLERY__ and __NOTOC__ + * Fills $this->mDoubleUnderscores, returns the modified text + * + * @param string $text + * + * @return string + */ + public function doDoubleUnderscore( $text ) { + # The position of __TOC__ needs to be recorded + $mw = MagicWord::get( 'toc' ); + if ( $mw->match( $text ) ) { + $this->mShowToc = true; + $this->mForceTocPosition = true; + + # Set a placeholder. At the end we'll fill it in with the TOC. + $text = $mw->replace( '<!--MWTOC\'"-->', $text, 1 ); + + # Only keep the first one. 
+ $text = $mw->replace( '', $text ); + } + + # Now match and remove the rest of them + $mwa = MagicWord::getDoubleUnderscoreArray(); + $this->mDoubleUnderscores = $mwa->matchAndRemove( $text ); + + if ( isset( $this->mDoubleUnderscores['nogallery'] ) ) { + $this->mOutput->mNoGallery = true; + } + if ( isset( $this->mDoubleUnderscores['notoc'] ) && !$this->mForceTocPosition ) { + $this->mShowToc = false; + } + if ( isset( $this->mDoubleUnderscores['hiddencat'] ) + && $this->mTitle->getNamespace() == NS_CATEGORY + ) { + $this->addTrackingCategory( 'hidden-category-category' ); + } + # (T10068) Allow control over whether robots index a page. + # __INDEX__ always overrides __NOINDEX__, see T16899 + if ( isset( $this->mDoubleUnderscores['noindex'] ) && $this->mTitle->canUseNoindex() ) { + $this->mOutput->setIndexPolicy( 'noindex' ); + $this->addTrackingCategory( 'noindex-category' ); + } + if ( isset( $this->mDoubleUnderscores['index'] ) && $this->mTitle->canUseNoindex() ) { + $this->mOutput->setIndexPolicy( 'index' ); + $this->addTrackingCategory( 'index-category' ); + } + + # Cache all double underscores in the database + foreach ( $this->mDoubleUnderscores as $key => $val ) { + $this->mOutput->setProperty( $key, '' ); + } + + return $text; + } + + /** + * @see ParserOutput::addTrackingCategory() + * @param string $msg Message key + * @return bool Whether the addition was successful + */ + public function addTrackingCategory( $msg ) { + return $this->mOutput->addTrackingCategory( $msg, $this->mTitle ); + } + + /** + * This function accomplishes several tasks: + * 1) Auto-number headings if that option is enabled + * 2) Add an [edit] link to sections for users who have enabled the option and can edit the page + * 3) Add a Table of contents on the top for users who have enabled the option + * 4) Auto-anchor headings + * + * It loops through all headlines, collects the necessary data, then splits up the + * string and re-inserts the newly formatted headlines. 
+ * + * @param string $text + * @param string $origText Original, untouched wikitext + * @param bool $isMain + * @return mixed|string + * @private + */ + public function formatHeadings( $text, $origText, $isMain = true ) { + global $wgMaxTocLevel; + + # Inhibit editsection links if requested in the page + if ( isset( $this->mDoubleUnderscores['noeditsection'] ) ) { + $maybeShowEditLink = false; + } else { + $maybeShowEditLink = true; /* Actual presence will depend on post-cache transforms */ + } + + # Get all headlines for numbering them and adding funky stuff like [edit] + # links - this is for later, but we need the number of headlines right now + # NOTE: white space in headings have been trimmed in doHeadings. They shouldn't + # be trimmed here since whitespace in HTML headings is significant. + $matches = []; + $numMatches = preg_match_all( + '/<H(?P<level>[1-6])(?P<attrib>.*?>)(?P<header>[\s\S]*?)<\/H[1-6] *>/i', + $text, + $matches + ); + + # if there are fewer than 4 headlines in the article, do not show TOC + # unless it's been explicitly enabled. + $enoughToc = $this->mShowToc && + ( ( $numMatches >= 4 ) || $this->mForceTocPosition ); + + # Allow user to stipulate that a page should have a "new section" + # link added via __NEWSECTIONLINK__ + if ( isset( $this->mDoubleUnderscores['newsectionlink'] ) ) { + $this->mOutput->setNewSection( true ); + } + + # Allow user to remove the "new section" + # link via __NONEWSECTIONLINK__ + if ( isset( $this->mDoubleUnderscores['nonewsectionlink'] ) ) { + $this->mOutput->hideNewSection( true ); + } + + # if the string __FORCETOC__ (not case-sensitive) occurs in the HTML, + # override above conditions and always show TOC above first header + if ( isset( $this->mDoubleUnderscores['forcetoc'] ) ) { + $this->mShowToc = true; + $enoughToc = true; + } + + # headline counter + $headlineCount = 0; + $numVisible = 0; + + # Ugh .. the TOC should have neat indentation levels which can be + # passed to the skin functions. 
These are determined here + $toc = ''; + $full = ''; + $head = []; + $sublevelCount = []; + $levelCount = []; + $level = 0; + $prevlevel = 0; + $toclevel = 0; + $prevtoclevel = 0; + $markerRegex = self::MARKER_PREFIX . "-h-(\d+)-" . self::MARKER_SUFFIX; + $baseTitleText = $this->mTitle->getPrefixedDBkey(); + $oldType = $this->mOutputType; + $this->setOutputType( self::OT_WIKI ); + $frame = $this->getPreprocessor()->newFrame(); + $root = $this->preprocessToDom( $origText ); + $node = $root->getFirstChild(); + $byteOffset = 0; + $tocraw = []; + $refers = []; + + $headlines = $numMatches !== false ? $matches[3] : []; + + foreach ( $headlines as $headline ) { + $isTemplate = false; + $titleText = false; + $sectionIndex = false; + $numbering = ''; + $markerMatches = []; + if ( preg_match( "/^$markerRegex/", $headline, $markerMatches ) ) { + $serial = $markerMatches[1]; + list( $titleText, $sectionIndex ) = $this->mHeadings[$serial]; + $isTemplate = ( $titleText != $baseTitleText ); + $headline = preg_replace( "/^$markerRegex\\s*/", "", $headline ); + } + + if ( $toclevel ) { + $prevlevel = $level; + } + $level = $matches[1][$headlineCount]; + + if ( $level > $prevlevel ) { + # Increase TOC level + $toclevel++; + $sublevelCount[$toclevel] = 0; + if ( $toclevel < $wgMaxTocLevel ) { + $prevtoclevel = $toclevel; + $toc .= Linker::tocIndent(); + $numVisible++; + } + } elseif ( $level < $prevlevel && $toclevel > 1 ) { + # Decrease TOC level, find level to jump to + + for ( $i = $toclevel; $i > 0; $i-- ) { + if ( $levelCount[$i] == $level ) { + # Found last matching level + $toclevel = $i; + break; + } elseif ( $levelCount[$i] < $level ) { + # Found first matching level below current level + $toclevel = $i + 1; + break; + } + } + if ( $i == 0 ) { + $toclevel = 1; + } + if ( $toclevel < $wgMaxTocLevel ) { + if ( $prevtoclevel < $wgMaxTocLevel ) { + # Unindent only if the previous toc level was shown :p + $toc .= Linker::tocUnindent( $prevtoclevel - $toclevel ); + $prevtoclevel 
= $toclevel; + } else { + $toc .= Linker::tocLineEnd(); + } + } + } else { + # No change in level, end TOC line + if ( $toclevel < $wgMaxTocLevel ) { + $toc .= Linker::tocLineEnd(); + } + } + + $levelCount[$toclevel] = $level; + + # count number of headlines for each level + $sublevelCount[$toclevel]++; + $dot = 0; + for ( $i = 1; $i <= $toclevel; $i++ ) { + if ( !empty( $sublevelCount[$i] ) ) { + if ( $dot ) { + $numbering .= '.'; + } + $numbering .= $this->getTargetLanguage()->formatNum( $sublevelCount[$i] ); + $dot = 1; + } + } + + # The safe header is a version of the header text safe to use for links + + # Remove link placeholders by the link text. + # <!--LINK number--> + # turns into + # link text with suffix + # Do this before unstrip since link text can contain strip markers + $safeHeadline = $this->replaceLinkHoldersText( $headline ); + + # Avoid insertion of weird stuff like <math> by expanding the relevant sections + $safeHeadline = $this->mStripState->unstripBoth( $safeHeadline ); + + # Strip out HTML (first regex removes any tag not allowed) + # Allowed tags are: + # * <sup> and <sub> (T10393) + # * <i> (T28375) + # * <b> (r105284) + # * <bdi> (T74884) + # * <span dir="rtl"> and <span dir="ltr"> (T37167) + # * <s> and <strike> (T35715) + # We strip any parameter from accepted tags (second regex), except dir="rtl|ltr" from <span>, + # to allow setting directionality in toc items. + $tocline = preg_replace( + [ + '#<(?!/?(span|sup|sub|bdi|i|b|s|strike)(?: [^>]*)?>).*?>#', + '#<(/?(?:span(?: dir="(?:rtl|ltr)")?|sup|sub|bdi|i|b|s|strike))(?: .*?)?>#' + ], + [ '', '<$1>' ], + $safeHeadline + ); + + # Strip '<span></span>', which is the result from the above if + # <span id="foo"></span> is used to produce an additional anchor + # for a section. 
+ $tocline = str_replace( '<span></span>', '', $tocline ); + + $tocline = trim( $tocline ); + + # For the anchor, strip out HTML-y stuff period + $safeHeadline = preg_replace( '/<.*?>/', '', $safeHeadline ); + $safeHeadline = Sanitizer::normalizeSectionNameWhitespace( $safeHeadline ); + + # Save headline for section edit hint before it's escaped + $headlineHint = $safeHeadline; + + # Decode HTML entities + $safeHeadline = Sanitizer::decodeCharReferences( $safeHeadline ); + + $safeHeadline = self::normalizeSectionName( $safeHeadline ); + + $fallbackHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_FALLBACK ); + $linkAnchor = Sanitizer::escapeIdForLink( $safeHeadline ); + $safeHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_PRIMARY ); + if ( $fallbackHeadline === $safeHeadline ) { + # No reason to have both (in fact, we can't) + $fallbackHeadline = false; + } + + # HTML IDs must be case-insensitively unique for IE compatibility (T12721). + # @todo FIXME: We may be changing them depending on the current locale. 
+ $arrayKey = strtolower( $safeHeadline ); + if ( $fallbackHeadline === false ) { + $fallbackArrayKey = false; + } else { + $fallbackArrayKey = strtolower( $fallbackHeadline ); + } + + # Create the anchor for linking from the TOC to the section + $anchor = $safeHeadline; + $fallbackAnchor = $fallbackHeadline; + if ( isset( $refers[$arrayKey] ) ) { + // phpcs:ignore Generic.CodeAnalysis.ForLoopWithTestFunctionCall,Generic.Formatting.DisallowMultipleStatements + for ( $i = 2; isset( $refers["${arrayKey}_$i"] ); ++$i ); + $anchor .= "_$i"; + $linkAnchor .= "_$i"; + $refers["${arrayKey}_$i"] = true; + } else { + $refers[$arrayKey] = true; + } + if ( $fallbackHeadline !== false && isset( $refers[$fallbackArrayKey] ) ) { + // phpcs:ignore Generic.CodeAnalysis.ForLoopWithTestFunctionCall,Generic.Formatting.DisallowMultipleStatements + for ( $i = 2; isset( $refers["${fallbackArrayKey}_$i"] ); ++$i ); + $fallbackAnchor .= "_$i"; + $refers["${fallbackArrayKey}_$i"] = true; + } else { + $refers[$fallbackArrayKey] = true; + } + + # Don't number the heading if it is the only one (looks silly) + if ( count( $matches[3] ) > 1 && $this->mOptions->getNumberHeadings() ) { + # the two are different if the line contains a link + $headline = Html::element( + 'span', + [ 'class' => 'mw-headline-number' ], + $numbering + ) . ' ' . $headline; + } + + if ( $enoughToc && ( !isset( $wgMaxTocLevel ) || $toclevel < $wgMaxTocLevel ) ) { + $toc .= Linker::tocLine( $linkAnchor, $tocline, + $numbering, $toclevel, ( $isTemplate ? 
false : $sectionIndex ) ); + } + + # Add the section to the section tree + # Find the DOM node for this header + $noOffset = ( $isTemplate || $sectionIndex === false ); + while ( $node && !$noOffset ) { + if ( $node->getName() === 'h' ) { + $bits = $node->splitHeading(); + if ( $bits['i'] == $sectionIndex ) { + break; + } + } + $byteOffset += mb_strlen( $this->mStripState->unstripBoth( + $frame->expand( $node, PPFrame::RECOVER_ORIG ) ) ); + $node = $node->getNextSibling(); + } + $tocraw[] = [ + 'toclevel' => $toclevel, + 'level' => $level, + 'line' => $tocline, + 'number' => $numbering, + 'index' => ( $isTemplate ? 'T-' : '' ) . $sectionIndex, + 'fromtitle' => $titleText, + 'byteoffset' => ( $noOffset ? null : $byteOffset ), + 'anchor' => $anchor, + ]; + + # give headline the correct <h#> tag + if ( $maybeShowEditLink && $sectionIndex !== false ) { + // Output edit section links as markers with styles that can be customized by skins + if ( $isTemplate ) { + # Put a T flag in the section identifier, to indicate to extractSections() + # that sections inside <includeonly> should be counted. + $editsectionPage = $titleText; + $editsectionSection = "T-$sectionIndex"; + $editsectionContent = null; + } else { + $editsectionPage = $this->mTitle->getPrefixedText(); + $editsectionSection = $sectionIndex; + $editsectionContent = $headlineHint; + } + // We use a bit of pesudo-xml for editsection markers. The + // language converter is run later on. Using a UNIQ style marker + // leads to the converter screwing up the tokens when it + // converts stuff. And trying to insert strip tags fails too. At + // this point all real inputted tags have already been escaped, + // so we don't have to worry about a user trying to input one of + // these markers directly. 
We use a page and section attribute + // to stop the language converter from converting these + // important bits of data, but put the headline hint inside a + // content block because the language converter is supposed to + // be able to convert that piece of data. + // Gets replaced with html in ParserOutput::getText + $editlink = '<mw:editsection page="' . htmlspecialchars( $editsectionPage ); + $editlink .= '" section="' . htmlspecialchars( $editsectionSection ) . '"'; + if ( $editsectionContent !== null ) { + $editlink .= '>' . $editsectionContent . '</mw:editsection>'; + } else { + $editlink .= '/>'; + } + } else { + $editlink = ''; + } + $head[$headlineCount] = Linker::makeHeadline( $level, + $matches['attrib'][$headlineCount], $anchor, $headline, + $editlink, $fallbackAnchor ); + + $headlineCount++; + } + + $this->setOutputType( $oldType ); + + # Never ever show TOC if no headers + if ( $numVisible < 1 ) { + $enoughToc = false; + } + + if ( $enoughToc ) { + if ( $prevtoclevel > 0 && $prevtoclevel < $wgMaxTocLevel ) { + $toc .= Linker::tocUnindent( $prevtoclevel - 1 ); + } + $toc = Linker::tocList( $toc, $this->mOptions->getUserLangObj() ); + $this->mOutput->setTOCHTML( $toc ); + $toc = self::TOC_START . $toc . self::TOC_END; + } + + if ( $isMain ) { + $this->mOutput->setSections( $tocraw ); + } + + # split up and insert constructed headlines + $blocks = preg_split( '/<H[1-6].*?>[\s\S]*?<\/H[1-6]>/i', $text ); + $i = 0; + + // build an array of document sections + $sections = []; + foreach ( $blocks as $block ) { + // $head is zero-based, sections aren't. + if ( empty( $head[$i - 1] ) ) { + $sections[$i] = $block; + } else { + $sections[$i] = $head[$i - 1] . $block; + } + + /** + * Send a hook, one per section. 
+ * The idea here is to be able to make section-level DIVs, but to do so in a + * lower-impact, more correct way than r50769 + * + * $this : caller + * $section : the section number + * &$sectionContent : ref to the content of the section + * $maybeShowEditLinks : boolean describing whether this section has an edit link + */ + Hooks::run( 'ParserSectionCreate', [ $this, $i, &$sections[$i], $maybeShowEditLink ] ); + + $i++; + } + + if ( $enoughToc && $isMain && !$this->mForceTocPosition ) { + // append the TOC at the beginning + // Top anchor now in skin + $sections[0] = $sections[0] . $toc . "\n"; + } + + $full .= implode( '', $sections ); + + if ( $this->mForceTocPosition ) { + return str_replace( '<!--MWTOC\'"-->', $toc, $full ); + } else { + return $full; + } + } + + /** + * Transform wiki markup when saving a page by doing "\r\n" -> "\n" + * conversion, substituting signatures, {{subst:}} templates, etc. + * + * @param string $text The text to transform + * @param Title $title The Title object for the current article + * @param User $user The User object describing the current user + * @param ParserOptions $options Parsing options + * @param bool $clearState Whether to clear the parser state first + * @return string The altered wiki markup + */ + public function preSaveTransform( $text, Title $title, User $user, + ParserOptions $options, $clearState = true + ) { + if ( $clearState ) { + $magicScopeVariable = $this->lock(); + } + $this->startParse( $title, $options, self::OT_WIKI, $clearState ); + $this->setUser( $user ); + + // Strip U+0000 NULL (T159174) + $text = str_replace( "\000", '', $text ); + + // We still normalize line endings for backwards-compatibility + // with other code that just calls PST, but this should already + // be handled in TextContent subclasses + $text = TextContent::normalizeLineEndings( $text ); + + if ( $options->getPreSaveTransform() ) { + $text = $this->pstPass2( $text, $user ); + } + $text = $this->mStripState->unstripBoth( $text 
); + + $this->setUser( null ); # Reset + + return $text; + } + + /** + * Pre-save transform helper function + * + * @param string $text + * @param User $user + * + * @return string + */ + private function pstPass2( $text, $user ) { + global $wgContLang; + + # Note: This is the timestamp saved as hardcoded wikitext to + # the database, we use $wgContLang here in order to give + # everyone the same signature and use the default one rather + # than the one selected in each user's preferences. + # (see also T14815) + $ts = $this->mOptions->getTimestamp(); + $timestamp = MWTimestamp::getLocalInstance( $ts ); + $ts = $timestamp->format( 'YmdHis' ); + $tzMsg = $timestamp->getTimezoneMessage()->inContentLanguage()->text(); + + $d = $wgContLang->timeanddate( $ts, false, false ) . " ($tzMsg)"; + + # Variable replacement + # Because mOutputType is OT_WIKI, this will only process {{subst:xxx}} type tags + $text = $this->replaceVariables( $text ); + + # This works almost by chance, as the replaceVariables are done before the getUserSig(), + # which may corrupt this parser instance via its wfMessage()->text() call- + + # Signatures + if ( strpos( $text, '~~~' ) !== false ) { + $sigText = $this->getUserSig( $user ); + $text = strtr( $text, [ + '~~~~~' => $d, + '~~~~' => "$sigText $d", + '~~~' => $sigText + ] ); + # The main two signature forms used above are time-sensitive + $this->mOutput->setFlag( 'user-signature' ); + } + + # Context links ("pipe tricks"): [[|name]] and [[name (context)|]] + $tc = '[' . Title::legalChars() . ']'; + $nc = '[ _0-9A-Za-z\x80-\xff-]'; # Namespaces can use non-ascii! 
+ + // [[ns:page (context)|]] + $p1 = "/\[\[(:?$nc+:|:|)($tc+?)( ?\\($tc+\\))\\|]]/"; + // [[ns:page(context)|]] (double-width brackets, added in r40257) + $p4 = "/\[\[(:?$nc+:|:|)($tc+?)( ?($tc+))\\|]]/"; + // [[ns:page (context), context|]] (using either single or double-width comma) + $p3 = "/\[\[(:?$nc+:|:|)($tc+?)( ?\\($tc+\\)|)((?:, |,)$tc+|)\\|]]/"; + // [[|page]] (reverse pipe trick: add context from page title) + $p2 = "/\[\[\\|($tc+)]]/"; + + # try $p1 first, to turn "[[A, B (C)|]]" into "[[A, B (C)|A, B]]" + $text = preg_replace( $p1, '[[\\1\\2\\3|\\2]]', $text ); + $text = preg_replace( $p4, '[[\\1\\2\\3|\\2]]', $text ); + $text = preg_replace( $p3, '[[\\1\\2\\3\\4|\\2]]', $text ); + + $t = $this->mTitle->getText(); + $m = []; + if ( preg_match( "/^($nc+:|)$tc+?( \\($tc+\\))$/", $t, $m ) ) { + $text = preg_replace( $p2, "[[$m[1]\\1$m[2]|\\1]]", $text ); + } elseif ( preg_match( "/^($nc+:|)$tc+?(, $tc+|)$/", $t, $m ) && "$m[1]$m[2]" != '' ) { + $text = preg_replace( $p2, "[[$m[1]\\1$m[2]|\\1]]", $text ); + } else { + # if there's no context, don't bother duplicating the title + $text = preg_replace( $p2, '[[\\1]]', $text ); + } + + return $text; + } + + /** + * Fetch the user's signature text, if any, and normalize to + * validated, ready-to-insert wikitext. + * If you have pre-fetched the nickname or the fancySig option, you can + * specify them here to save a database query. + * Do not reuse this parser instance after calling getUserSig(), + * as it may have changed if it's the $wgParser. + * + * @param User &$user + * @param string|bool $nickname Nickname to use or false to use user's default nickname + * @param bool|null $fancySig whether the nicknname is the complete signature + * or null to use default value + * @return string + */ + public function getUserSig( &$user, $nickname = false, $fancySig = null ) { + global $wgMaxSigChars; + + $username = $user->getName(); + + # If not given, retrieve from the user object. 
+ if ( $nickname === false ) { + $nickname = $user->getOption( 'nickname' ); + } + + if ( is_null( $fancySig ) ) { + $fancySig = $user->getBoolOption( 'fancysig' ); + } + + $nickname = $nickname == null ? $username : $nickname; + + if ( mb_strlen( $nickname ) > $wgMaxSigChars ) { + $nickname = $username; + wfDebug( __METHOD__ . ": $username has overlong signature.\n" ); + } elseif ( $fancySig !== false ) { + # Sig. might contain markup; validate this + if ( $this->validateSig( $nickname ) !== false ) { + # Validated; clean up (if needed) and return it + return $this->cleanSig( $nickname, true ); + } else { + # Failed to validate; fall back to the default + $nickname = $username; + wfDebug( __METHOD__ . ": $username has bad XML tags in signature.\n" ); + } + } + + # Make sure nickname doesnt get a sig in a sig + $nickname = self::cleanSigInSig( $nickname ); + + # If we're still here, make it a link to the user page + $userText = wfEscapeWikiText( $username ); + $nickText = wfEscapeWikiText( $nickname ); + $msgName = $user->isAnon() ? 'signature-anon' : 'signature'; + + return wfMessage( $msgName, $userText, $nickText )->inContentLanguage() + ->title( $this->getTitle() )->text(); + } + + /** + * Check that the user's signature contains no bad XML + * + * @param string $text + * @return string|bool An expanded string, or false if invalid. + */ + public function validateSig( $text ) { + return Xml::isWellFormedXmlFragment( $text ) ? 
$text : false; + } + + /** + * Clean up signature text + * + * 1) Strip 3, 4 or 5 tildes out of signatures @see cleanSigInSig + * 2) Substitute all transclusions + * + * @param string $text + * @param bool $parsing Whether we're cleaning (preferences save) or parsing + * @return string Signature text + */ + public function cleanSig( $text, $parsing = false ) { + if ( !$parsing ) { + global $wgTitle; + $magicScopeVariable = $this->lock(); + $this->startParse( $wgTitle, new ParserOptions, self::OT_PREPROCESS, true ); + } + + # Option to disable this feature + if ( !$this->mOptions->getCleanSignatures() ) { + return $text; + } + + # @todo FIXME: Regex doesn't respect extension tags or nowiki + # => Move this logic to braceSubstitution() + $substWord = MagicWord::get( 'subst' ); + $substRegex = '/\{\{(?!(?:' . $substWord->getBaseRegex() . '))/x' . $substWord->getRegexCase(); + $substText = '{{' . $substWord->getSynonym( 0 ); + + $text = preg_replace( $substRegex, $substText, $text ); + $text = self::cleanSigInSig( $text ); + $dom = $this->preprocessToDom( $text ); + $frame = $this->getPreprocessor()->newFrame(); + $text = $frame->expand( $dom ); + + if ( !$parsing ) { + $text = $this->mStripState->unstripBoth( $text ); + } + + return $text; + } + + /** + * Strip 3, 4 or 5 tildes out of signatures. 
+ * + * @param string $text + * @return string Signature text with /~{3,5}/ removed + */ + public static function cleanSigInSig( $text ) { + $text = preg_replace( '/~{3,5}/', '', $text ); + return $text; + } + + /** + * Set up some variables which are usually set up in parse() + * so that an external function can call some class members with confidence + * + * @param Title|null $title + * @param ParserOptions $options + * @param int $outputType + * @param bool $clearState + */ + public function startExternalParse( Title $title = null, ParserOptions $options, + $outputType, $clearState = true + ) { + $this->startParse( $title, $options, $outputType, $clearState ); + } + + /** + * @param Title|null $title + * @param ParserOptions $options + * @param int $outputType + * @param bool $clearState + */ + private function startParse( Title $title = null, ParserOptions $options, + $outputType, $clearState = true + ) { + $this->setTitle( $title ); + $this->mOptions = $options; + $this->setOutputType( $outputType ); + if ( $clearState ) { + $this->clearState(); + } + } + + /** + * Wrapper for preprocess() + * + * @param string $text The text to preprocess + * @param ParserOptions $options + * @param Title|null $title Title object or null to use $wgTitle + * @return string + */ + public function transformMsg( $text, $options, $title = null ) { + static $executing = false; + + # Guard against infinite recursion + if ( $executing ) { + return $text; + } + $executing = true; + + if ( !$title ) { + global $wgTitle; + $title = $wgTitle; + } + + $text = $this->preprocess( $text, $title, $options ); + + $executing = false; + return $text; + } + + /** + * Create an HTML-style tag, e.g. "<yourtag>special text</yourtag>" + * The callback should have the following form: + * function myParserHook( $text, $params, $parser, $frame ) { ... } + * + * Transform and return $text. Use $parser for any required context, e.g. 
use + * $parser->getTitle() and $parser->getOptions() not $wgTitle or $wgOut->mParserOptions + * + * Hooks may return extended information by returning an array, of which the + * first numbered element (index 0) must be the return string, and all other + * entries are extracted into local variables within an internal function + * in the Parser class. + * + * This interface (introduced r61913) appears to be undocumented, but + * 'markerType' is used by some core tag hooks to override which strip + * array their results are placed in. **Use great caution if attempting + * this interface, as it is not documented and injudicious use could smash + * private variables.** + * + * @param string $tag The tag to use, e.g. 'hook' for "<hook>" + * @param callable $callback The callback function (and object) to use for the tag + * @throws MWException + * @return callable|null The old value of the mTagHooks array associated with the hook + */ + public function setHook( $tag, callable $callback ) { + $tag = strtolower( $tag ); + if ( preg_match( '/[<>\r\n]/', $tag, $m ) ) { + throw new MWException( "Invalid character {$m[0]} in setHook('$tag', ...) call" ); + } + $oldVal = isset( $this->mTagHooks[$tag] ) ? $this->mTagHooks[$tag] : null; + $this->mTagHooks[$tag] = $callback; + if ( !in_array( $tag, $this->mStripList ) ) { + $this->mStripList[] = $tag; + } + + return $oldVal; + } + + /** + * As setHook(), but letting the contents be parsed. + * + * Transparent tag hooks are like regular XML-style tag hooks, except they + * operate late in the transformation sequence, on HTML instead of wikitext. + * + * This is probably obsoleted by things dealing with parser frames? + * The only extension currently using it is geoserver. + * + * @since 1.10 + * @todo better document or deprecate this + * + * @param string $tag The tag to use, e.g. 
'hook' for "<hook>" + * @param callable $callback The callback function (and object) to use for the tag + * @throws MWException + * @return callable|null The old value of the mTagHooks array associated with the hook + */ + public function setTransparentTagHook( $tag, callable $callback ) { + $tag = strtolower( $tag ); + if ( preg_match( '/[<>\r\n]/', $tag, $m ) ) { + throw new MWException( "Invalid character {$m[0]} in setTransparentHook('$tag', ...) call" ); + } + $oldVal = isset( $this->mTransparentTagHooks[$tag] ) ? $this->mTransparentTagHooks[$tag] : null; + $this->mTransparentTagHooks[$tag] = $callback; + + return $oldVal; + } + + /** + * Remove all tag hooks + */ + public function clearTagHooks() { + $this->mTagHooks = []; + $this->mFunctionTagHooks = []; + $this->mStripList = $this->mDefaultStripList; + } + + /** + * Create a function, e.g. {{sum:1|2|3}} + * The callback function should have the form: + * function myParserFunction( &$parser, $arg1, $arg2, $arg3 ) { ... } + * + * Or with Parser::SFH_OBJECT_ARGS: + * function myParserFunction( $parser, $frame, $args ) { ... } + * + * The callback may either return the text result of the function, or an array with the text + * in element 0, and a number of flags in the other elements. The names of the flags are + * specified in the keys. Valid flags are: + * found The text returned is valid, stop processing the template. This + * is on by default. + * nowiki Wiki markup in the return value should be escaped + * isHTML The returned text is HTML, armour it against wikitext transformation + * + * @param string $id The magic word ID + * @param callable $callback The callback function (and object) to use + * @param int $flags A combination of the following flags: + * Parser::SFH_NO_HASH No leading hash, i.e. {{plural:...}} instead of {{#if:...}} + * + * Parser::SFH_OBJECT_ARGS Pass the template arguments as PPNode objects instead of text. 
+ * This allows for conditional expansion of the parse tree, allowing you to eliminate dead + * branches and thus speed up parsing. It is also possible to analyse the parse tree of + * the arguments, and to control the way they are expanded. + * + * The $frame parameter is a PPFrame. This can be used to produce expanded text from the + * arguments, for instance: + * $text = isset( $args[0] ) ? $frame->expand( $args[0] ) : ''; + * + * For technical reasons, $args[0] is pre-expanded and will be a string. This may change in + * future versions. Please call $frame->expand() on it anyway so that your code keeps + * working if/when this is changed. + * + * If you want whitespace to be trimmed from $args, you need to do it yourself, post- + * expansion. + * + * Please read the documentation in includes/parser/Preprocessor.php for more information + * about the methods available in PPFrame and PPNode. + * + * @throws MWException + * @return string|callable The old callback function for this name, if any + */ + public function setFunctionHook( $id, callable $callback, $flags = 0 ) { + global $wgContLang; + + $oldVal = isset( $this->mFunctionHooks[$id] ) ? $this->mFunctionHooks[$id][0] : null; + $this->mFunctionHooks[$id] = [ $callback, $flags ]; + + # Add to function cache + $mw = MagicWord::get( $id ); + if ( !$mw ) { + throw new MWException( __METHOD__ . '() expecting a magic word identifier.' ); + } + + $synonyms = $mw->getSynonyms(); + $sensitive = intval( $mw->isCaseSensitive() ); + + foreach ( $synonyms as $syn ) { + # Case + if ( !$sensitive ) { + $syn = $wgContLang->lc( $syn ); + } + # Add leading hash + if ( !( $flags & self::SFH_NO_HASH ) ) { + $syn = '#' . 
$syn; + } + # Remove trailing colon + if ( substr( $syn, -1, 1 ) === ':' ) { + $syn = substr( $syn, 0, -1 ); + } + $this->mFunctionSynonyms[$sensitive][$syn] = $id; + } + return $oldVal; + } + + /** + * Get all registered function hook identifiers + * + * @return array + */ + public function getFunctionHooks() { + return array_keys( $this->mFunctionHooks ); + } + + /** + * Create a tag function, e.g. "<test>some stuff</test>". + * Unlike tag hooks, tag functions are parsed at preprocessor level. + * Unlike parser functions, their content is not preprocessed. + * @param string $tag + * @param callable $callback + * @param int $flags + * @throws MWException + * @return null + */ + public function setFunctionTagHook( $tag, callable $callback, $flags ) { + $tag = strtolower( $tag ); + if ( preg_match( '/[<>\r\n]/', $tag, $m ) ) { + throw new MWException( "Invalid character {$m[0]} in setFunctionTagHook('$tag', ...) call" ); + } + $old = isset( $this->mFunctionTagHooks[$tag] ) ? + $this->mFunctionTagHooks[$tag] : null; + $this->mFunctionTagHooks[$tag] = [ $callback, $flags ]; + + if ( !in_array( $tag, $this->mStripList ) ) { + $this->mStripList[] = $tag; + } + + return $old; + } + + /** + * Replace "<!--LINK-->" link placeholders with actual links, in the buffer + * Placeholders created in Linker::link() + * + * @param string &$text + * @param int $options + */ + public function replaceLinkHolders( &$text, $options = 0 ) { + $this->mLinkHolders->replace( $text ); + } + + /** + * Replace "<!--LINK-->" link placeholders with plain text of links + * (not HTML-formatted). + * + * @param string $text + * @return string + */ + public function replaceLinkHoldersText( $text ) { + return $this->mLinkHolders->replaceText( $text ); + } + + /** + * Renders an image gallery from a text with one line per image. + * text labels may be given by using |-style alternative text. E.g. 
+ * Image:one.jpg|The number "1" + * Image:tree.jpg|A tree + * given as text will return the HTML of a gallery with two images, + * labeled 'The number "1"' and + * 'A tree'. + * + * @param string $text + * @param array $params + * @return string HTML + */ + public function renderImageGallery( $text, $params ) { + $mode = false; + if ( isset( $params['mode'] ) ) { + $mode = $params['mode']; + } + + try { + $ig = ImageGalleryBase::factory( $mode ); + } catch ( Exception $e ) { + // If invalid type set, fallback to default. + $ig = ImageGalleryBase::factory( false ); + } + + $ig->setContextTitle( $this->mTitle ); + $ig->setShowBytes( false ); + $ig->setShowDimensions( false ); + $ig->setShowFilename( false ); + $ig->setParser( $this ); + $ig->setHideBadImages(); + $ig->setAttributes( Sanitizer::validateTagAttributes( $params, 'ul' ) ); + + if ( isset( $params['showfilename'] ) ) { + $ig->setShowFilename( true ); + } else { + $ig->setShowFilename( false ); + } + if ( isset( $params['caption'] ) ) { + $caption = $params['caption']; + $caption = htmlspecialchars( $caption ); + $caption = $this->replaceInternalLinks( $caption ); + $ig->setCaptionHtml( $caption ); + } + if ( isset( $params['perrow'] ) ) { + $ig->setPerRow( $params['perrow'] ); + } + if ( isset( $params['widths'] ) ) { + $ig->setWidths( $params['widths'] ); + } + if ( isset( $params['heights'] ) ) { + $ig->setHeights( $params['heights'] ); + } + $ig->setAdditionalOptions( $params ); + + // Avoid PHP 7.1 warning from passing $this by reference + $parser = $this; + Hooks::run( 'BeforeParserrenderImageGallery', [ &$parser, &$ig ] ); + + $lines = StringUtils::explode( "\n", $text ); + foreach ( $lines as $line ) { + # match lines like these: + # Image:someimage.jpg|This is some image + $matches = []; + preg_match( "/^([^|]+)(\\|(.*))?$/", $line, $matches ); + # Skip empty lines + if ( count( $matches ) == 0 ) { + continue; + } + + if ( strpos( $matches[0], '%' ) !== false ) { + $matches[1] = rawurldecode( 
$matches[1] ); + } + $title = Title::newFromText( $matches[1], NS_FILE ); + if ( is_null( $title ) ) { + # Bogus title. Ignore these so we don't bomb out later. + continue; + } + + # We need to get what handler the file uses, to figure out parameters. + # Note, a hook can overide the file name, and chose an entirely different + # file (which potentially could be of a different type and have different handler). + $options = []; + $descQuery = false; + Hooks::run( 'BeforeParserFetchFileAndTitle', + [ $this, $title, &$options, &$descQuery ] ); + # Don't register it now, as TraditionalImageGallery does that later. + $file = $this->fetchFileNoRegister( $title, $options ); + $handler = $file ? $file->getHandler() : false; + + $paramMap = [ + 'img_alt' => 'gallery-internal-alt', + 'img_link' => 'gallery-internal-link', + ]; + if ( $handler ) { + $paramMap = $paramMap + $handler->getParamMap(); + // We don't want people to specify per-image widths. + // Additionally the width parameter would need special casing anyhow. + unset( $paramMap['img_width'] ); + } + + $mwArray = new MagicWordArray( array_keys( $paramMap ) ); + + $label = ''; + $alt = ''; + $link = ''; + $handlerOptions = []; + if ( isset( $matches[3] ) ) { + // look for an |alt= definition while trying not to break existing + // captions with multiple pipes (|) in it, until a more sensible grammar + // is defined for images in galleries + + // FIXME: Doing recursiveTagParse at this stage, and the trim before + // splitting on '|' is a bit odd, and different from makeImage. 
+ $matches[3] = $this->recursiveTagParse( trim( $matches[3] ) ); + // Protect LanguageConverter markup + $parameterMatches = StringUtils::delimiterExplode( + '-{', '}-', '|', $matches[3], true /* nested */ + ); + + foreach ( $parameterMatches as $parameterMatch ) { + list( $magicName, $match ) = $mwArray->matchVariableStartToEnd( $parameterMatch ); + if ( $magicName ) { + $paramName = $paramMap[$magicName]; + + switch ( $paramName ) { + case 'gallery-internal-alt': + $alt = $this->stripAltText( $match, false ); + break; + case 'gallery-internal-link': + $linkValue = strip_tags( $this->replaceLinkHoldersText( $match ) ); + $chars = self::EXT_LINK_URL_CLASS; + $addr = self::EXT_LINK_ADDR; + $prots = $this->mUrlProtocols; + // check to see if link matches an absolute url, if not then it must be a wiki link. + if ( preg_match( '/^-{R|(.*)}-$/', $linkValue ) ) { + // Result of LanguageConverter::markNoConversion + // invoked on an external link. + $linkValue = substr( $linkValue, 4, -2 ); + } + if ( preg_match( "/^($prots)$addr$chars*$/u", $linkValue ) ) { + $link = $linkValue; + $this->mOutput->addExternalLink( $link ); + } else { + $localLinkTitle = Title::newFromText( $linkValue ); + if ( $localLinkTitle !== null ) { + $this->mOutput->addLink( $localLinkTitle ); + $link = $localLinkTitle->getLinkURL(); + } + } + break; + default: + // Must be a handler specific parameter. + if ( $handler->validateParam( $paramName, $match ) ) { + $handlerOptions[$paramName] = $match; + } else { + // Guess not, consider it as caption. + wfDebug( "$parameterMatch failed parameter validation\n" ); + $label = '|' . $parameterMatch; + } + } + + } else { + // Last pipe wins. + $label = '|' . $parameterMatch; + } + } + // Remove the pipe. 
+ $label = substr( $label, 1 ); + } + + $ig->add( $title, $label, $alt, $link, $handlerOptions ); + } + $html = $ig->toHTML(); + Hooks::run( 'AfterParserFetchFileAndTitle', [ $this, $ig, &$html ] ); + return $html; + } + + /** + * @param MediaHandler $handler + * @return array + */ + public function getImageParams( $handler ) { + if ( $handler ) { + $handlerClass = get_class( $handler ); + } else { + $handlerClass = ''; + } + if ( !isset( $this->mImageParams[$handlerClass] ) ) { + # Initialise static lists + static $internalParamNames = [ + 'horizAlign' => [ 'left', 'right', 'center', 'none' ], + 'vertAlign' => [ 'baseline', 'sub', 'super', 'top', 'text-top', 'middle', + 'bottom', 'text-bottom' ], + 'frame' => [ 'thumbnail', 'manualthumb', 'framed', 'frameless', + 'upright', 'border', 'link', 'alt', 'class' ], + ]; + static $internalParamMap; + if ( !$internalParamMap ) { + $internalParamMap = []; + foreach ( $internalParamNames as $type => $names ) { + foreach ( $names as $name ) { + // For grep: img_left, img_right, img_center, img_none, + // img_baseline, img_sub, img_super, img_top, img_text_top, img_middle, + // img_bottom, img_text_bottom, + // img_thumbnail, img_manualthumb, img_framed, img_frameless, img_upright, + // img_border, img_link, img_alt, img_class + $magicName = str_replace( '-', '_', "img_$name" ); + $internalParamMap[$magicName] = [ $type, $name ]; + } + } + } + + # Add handler params + $paramMap = $internalParamMap; + if ( $handler ) { + $handlerParamMap = $handler->getParamMap(); + foreach ( $handlerParamMap as $magic => $paramName ) { + $paramMap[$magic] = [ 'handler', $paramName ]; + } + } + $this->mImageParams[$handlerClass] = $paramMap; + $this->mImageParamsMagicArray[$handlerClass] = new MagicWordArray( array_keys( $paramMap ) ); + } + return [ $this->mImageParams[$handlerClass], $this->mImageParamsMagicArray[$handlerClass] ]; + } + + /** + * Parse image options text and use it to make an image + * + * @param Title $title + * @param 
string $options
 * @param LinkHolderArray|bool $holders
 * @return string HTML
 */
public function makeImage( $title, $options, $holders = false ) {
	# Check if the options text is of the form "options|alt text"
	# Options are:
	#  * thumbnail  make a thumbnail with enlarge-icon and caption, alignment depends on lang
	#  * left       no resizing, just left align. label is used for alt= only
	#  * right      same, but right aligned
	#  * none       same, but not aligned
	#  * ___px      scale to ___ pixels width, no aligning. e.g. use in taxobox
	#  * center     center the image
	#  * frame      Keep original image size, no magnify-button.
	#  * framed     Same as "frame"
	#  * frameless  like 'thumb' but without a frame. Keeps user preferences for width
	#  * upright    reduce width for upright images, rounded to full __0 px
	#  * border     draw a 1px border around the image
	#  * alt        Text for HTML alt attribute (defaults to empty)
	#  * class      Set a class for img node
	#  * link       Set the target of the image link. Can be external, interwiki, or local
	# vertical-align values (no % or length right now):
	#  * baseline
	#  * sub
	#  * super
	#  * top
	#  * text-top
	#  * middle
	#  * bottom
	#  * text-bottom

	# Protect LanguageConverter markup when splitting into parts
	$parts = StringUtils::delimiterExplode(
		'-{', '}-', '|', $options, true /* allow nesting */
	);

	# Give extensions a chance to select the file revision for us.
	# From here on $options is the hook-supplied option array, not the input text.
	$options = [];
	$descQuery = false;
	Hooks::run( 'BeforeParserFetchFileAndTitle',
		[ $this, $title, &$options, &$descQuery ] );
	# Fetch and register the file (file title may be different via hooks)
	list( $file, $title ) = $this->fetchFileAndTitle( $title, $options );

	# Get parameter map
	$handler = $file ? $file->getHandler() : false;
	list( $paramMap, $mwArray ) = $this->getImageParams( $handler );

	if ( !$file ) {
		$this->addTrackingCategory( 'broken-file-category' );
	}

	# Process the input parameters
	$caption = '';
	$params = [ 'frame' => [], 'handler' => [],
		'horizAlign' => [], 'vertAlign' => [] ];
	$seenformat = false;
	foreach ( $parts as $part ) {
		$part = trim( $part );
		list( $magicName, $value ) = $mwArray->matchVariableStartToEnd( $part );

		if ( !isset( $paramMap[$magicName] ) ) {
			# Not a known parameter: treat it as the caption (last one wins)
			$caption = $part;
			continue;
		}

		list( $type, $paramName ) = $paramMap[$magicName];
		$validated = false;

		if ( $type === 'handler' && $paramName === 'width' ) {
			# Special case; width and height come in one variable together
			$dimensions = self::parseWidthParam( $value );
			foreach ( [ 'width', 'height' ] as $dimension ) {
				if ( !isset( $dimensions[$dimension] ) ) {
					continue;
				}
				if ( $handler->validateParam( $dimension, $dimensions[$dimension] ) ) {
					$params[$type][$dimension] = $dimensions[$dimension];
					$validated = true;
				}
			}
			# else no validation -- T15436
		} else {
			if ( $type === 'handler' ) {
				# Validate handler parameter
				$validated = $handler->validateParam( $paramName, $value );
			} else {
				# Validate internal parameters
				switch ( $paramName ) {
					case 'manualthumb':
					case 'alt':
					case 'class':
						# @todo FIXME: Possibly check validity here for
						# manualthumb? downstream behavior seems odd with
						# missing manual thumbs.
						$validated = true;
						$value = $this->stripAltText( $value, $holders );
						break;
					case 'link':
						$chars = self::EXT_LINK_URL_CLASS;
						$addr = self::EXT_LINK_ADDR;
						$prots = $this->mUrlProtocols;
						if ( $value === '' ) {
							# Empty link= means "do not link the image"
							$paramName = 'no-link';
							$value = true;
							$validated = true;
						} elseif ( preg_match( "/^((?i)$prots)/", $value ) ) {
							# Starts with a URL protocol: only accept a well-formed URL
							if ( preg_match( "/^((?i)$prots)$addr$chars*$/u", $value ) ) {
								$paramName = 'link-url';
								$this->mOutput->addExternalLink( $value );
								if ( $this->mOptions->getExternalLinkTarget() ) {
									$params[$type]['link-target'] = $this->mOptions->getExternalLinkTarget();
								}
								$validated = true;
							}
						} else {
							# Anything else has to parse as a page title
							$linkTitle = Title::newFromText( $value );
							if ( $linkTitle ) {
								$paramName = 'link-title';
								$value = $linkTitle;
								$this->mOutput->addLink( $linkTitle );
								$validated = true;
							}
						}
						break;
					case 'frameless':
					case 'framed':
					case 'thumbnail':
						// use first appearing option, discard others.
						$validated = !$seenformat;
						$seenformat = true;
						break;
					default:
						# Most other things appear to be empty or numeric...
						$validated = ( $value === false || is_numeric( trim( $value ) ) );
				}
			}

			if ( $validated ) {
				$params[$type][$paramName] = $value;
			}
		}

		if ( !$validated ) {
			$caption = $part;
		}
	}

	# Process alignment parameters: the first key recorded in each group is used
	foreach ( [ 'horizAlign' => 'align', 'vertAlign' => 'valign' ] as $group => $frameKey ) {
		if ( $params[$group] ) {
			$params['frame'][$frameKey] = key( $params[$group] );
		}
	}

	$params['frame']['caption'] = $caption;

	# Will the image be presented in a frame, with the caption below?
	$imageIsFramed = isset( $params['frame']['frame'] )
		|| isset( $params['frame']['framed'] )
		|| isset( $params['frame']['thumbnail'] )
		|| isset( $params['frame']['manualthumb'] );

	# In the old days, [[Image:Foo|text...]] would set alt text.  Later it
	# came to also set the caption, ordinary text after the image -- which
	# makes no sense, because that just repeats the text multiple times in
	# screen readers.  It *also* came to set the title attribute.
	# Now that we have an alt attribute, we should not set the alt text to
	# equal the caption: that's worse than useless, it just repeats the
	# text.  This is the framed/thumbnail case.  If there's no caption, we
	# use the unnamed parameter for alt text as well, just for the time be-
	# ing, if the unnamed param is set and the alt param is not.
	# For the future, we need to figure out if we want to tweak this more,
	# e.g., introducing a title= parameter for the title; ignoring the un-
	# named parameter entirely for images without a caption; adding an ex-
	# plicit caption= parameter and preserving the old magic unnamed para-
	# meter for BC; ...
	if ( $imageIsFramed ) { # Framed image
		if ( $caption === '' && !isset( $params['frame']['alt'] ) ) {
			# No caption or alt text, add the filename as the alt text so
			# that screen readers at least get some description of the image
			$params['frame']['alt'] = $title->getText();
		}
		# Do not set $params['frame']['title'] because tooltips don't make sense
		# for framed images
	} else { # Inline image
		if ( !isset( $params['frame']['alt'] ) ) {
			# No alt text: use the "caption" if there is one, otherwise
			# fall back to the filename
			$params['frame']['alt'] = $caption !== ''
				? $this->stripAltText( $caption, $holders )
				: $title->getText();
		}
		# Use the "caption" for the tooltip text
		$params['frame']['title'] = $this->stripAltText( $caption, $holders );
	}

	Hooks::run( 'ParserMakeImageParams', [ $title, $file, &$params, $this ] );

	# Linker does the rest
	$time = isset( $options['time'] ) ? $options['time'] : false;
	$ret = Linker::makeImageLink( $this, $title, $file, $params['frame'], $params['handler'],
		$time, $descQuery, $this->mOptions->getThumbSize() );

	# Give the handler a chance to modify the parser object
	if ( $handler ) {
		$handler->parserTransformHook( $this, $file );
	}

	return $ret;
}

/**
 * Turn caption wikitext into plain text usable in an alt or title attribute:
 * link holders are replaced, strip markers are expanded, and all tags removed.
 *
 * @param string $caption
 * @param LinkHolderArray|bool $holders Holder array to resolve link
 *  placeholders with; when falsy, the parser's own link holders are used
 * @return mixed|string
 */
protected function stripAltText( $caption, $holders ) {
	# Strip bad stuff out of the title (tooltip).  We can't just use
	# replaceLinkHoldersText() here, because if this function is called
	# from replaceInternalLinks2(), mLinkHolders won't be up-to-date.
	$tooltip = $holders
		? $holders->replaceText( $caption )
		: $this->replaceLinkHoldersText( $caption );

	# make sure there are no placeholders in thumbnail attributes
	# that are later expanded to html- so expand them now and
	# remove the tags
	return Sanitizer::stripAllTags( $this->mStripState->unstripBoth( $tooltip ) );
}

/**
 * Set a flag in the output object indicating that the content is dynamic and
 * shouldn't be cached.
 * @deprecated since 1.28; use getOutput()->updateCacheExpiry()
 */
public function disableCache() {
	wfDebug( "Parser output marked as uncacheable.\n" );
	if ( !$this->mOutput ) {
		throw new MWException( __METHOD__ .
			" can only be called when actually parsing something" );
	}
	$this->mOutput->updateCacheExpiry( 0 ); // new style, for consistency
}

/**
 * Callback from the Sanitizer for expanding items found in HTML attribute
 * values, so they can be safely tested and escaped.
*
 * @param string &$text
 * @param bool|PPFrame $frame
 * @return string
 */
public function attributeStripCallback( &$text, $frame = false ) {
	# $text is passed by reference, so the caller's variable is updated
	# as well as the value being returned
	$text = $this->replaceVariables( $text, $frame );
	$text = $this->mStripState->unstripBoth( $text );

	return $text;
}

/**
 * List the names of all registered tag hooks: transparent tag hooks first,
 * then regular tag hooks, then function tag hooks.
 *
 * @return array
 */
public function getTags() {
	$transparent = array_keys( $this->mTransparentTagHooks );
	$regular = array_keys( $this->mTagHooks );
	$functionTags = array_keys( $this->mFunctionTagHooks );

	return array_merge( $transparent, $regular, $functionTags );
}

/**
 * Replace transparent tags in $text with the values given by the callbacks.
 *
 * Transparent tag hooks are like regular XML-style tag hooks, except they
 * operate late in the transformation sequence, on HTML instead of wikitext.
 *
 * @param string $text
 *
 * @return string
 */
public function replaceTransparentTags( $text ) {
	$matches = [];
	$hookedElements = array_keys( $this->mTransparentTagHooks );
	$text = self::extractTagsAndParams( $hookedElements, $text, $matches );

	$replacements = [];
	foreach ( $matches as $marker => $data ) {
		list( $element, $content, $params, $tag ) = $data;
		$tagName = strtolower( $element );

		# Without a registered hook the original tag text is put back
		$replacements[$marker] = isset( $this->mTransparentTagHooks[$tagName] )
			? call_user_func_array(
				$this->mTransparentTagHooks[$tagName],
				[ $content, $params, $this ]
			)
			: $tag;
	}

	return strtr( $text, $replacements );
}

/**
 * Break wikitext input into sections, and either pull or replace
 * some particular section's text.
 *
 * External callers should use the getSection and replaceSection methods.
 *
 * @param string $text Page wikitext
 * @param string|int $sectionId A section identifier string of the form:
 *   "<flag1> - <flag2> - ... - <section number>"
 *
 * Currently the only recognised flag is "T", which means the target section number
 * was derived during a template inclusion parse, in other words this is a template
 * section edit link.
If no flags are given, it was an ordinary section edit link.
 * This flag is required to avoid a section numbering mismatch when a section is
 * enclosed by "<includeonly>" (T8563).
 *
 * The section number 0 pulls the text before the first heading; other numbers will
 * pull the given section along with its lower-level subsections. If the section is
 * not found, $mode=get will return $newText, and $mode=replace will return $text.
 *
 * Section 0 is always considered to exist, even if it only contains the empty
 * string. If $text is the empty string and section 0 is replaced, $newText is
 * returned.
 *
 * @param string $mode One of "get" or "replace"
 * @param string $newText Replacement text for section data.
 * @return string For "get", the extracted section text.
 *   for "replace", the whole page with the section replaced.
 */
private function extractSections( $text, $sectionId, $mode, $newText = '' ) {
	global $wgTitle; # not generally used but removes an ugly failure mode

	# Held in a local so the parser lock lasts until this method returns
	$scopedLock = $this->lock();
	$this->startParse( $wgTitle, new ParserOptions, self::OT_PLAIN, true );
	$outText = '';
	$frame = $this->getPreprocessor()->newFrame();

	# Process section extraction flags: everything before the last "-" is a
	# flag, the last piece is the section index
	$sectionParts = explode( '-', $sectionId );
	$sectionIndex = array_pop( $sectionParts );
	$flags = in_array( 'T', $sectionParts, true ) ? self::PTD_FOR_INCLUSION : 0;

	# Check for empty input
	if ( strval( $text ) === '' ) {
		# Only sections 0 and T-0 exist in an empty document
		if ( $sectionIndex == 0 ) {
			return $mode === 'get' ? '' : $newText;
		}
		return $mode === 'get' ? $newText : $text;
	}

	# Preprocess the text
	$root = $this->preprocessToDom( $text, $flags );

	# <h> nodes indicate section breaks
	# They can only occur at the top level, so we can find them by iterating the root's children
	$node = $root->getFirstChild();

	# Find the target section
	if ( $sectionIndex == 0 ) {
		# Section zero doesn't nest, level=big
		$targetLevel = 1000;
	} else {
		for ( ; $node; $node = $node->getNextSibling() ) {
			if ( $node->getName() === 'h' ) {
				$heading = $node->splitHeading();
				if ( $heading['i'] == $sectionIndex ) {
					$targetLevel = $heading['level'];
					break;
				}
			}
			# In replace mode, everything before the target is copied through
			if ( $mode === 'replace' ) {
				$outText .= $frame->expand( $node, PPFrame::RECOVER_ORIG );
			}
		}
	}

	if ( !$node ) {
		# Not found
		return $mode === 'get' ? $newText : $text;
	}

	# Find the end of the section, including nested sections
	for ( ; $node; $node = $node->getNextSibling() ) {
		if ( $node->getName() === 'h' ) {
			$heading = $node->splitHeading();
			$curLevel = $heading['level'];
			if ( $heading['i'] != $sectionIndex && $curLevel <= $targetLevel ) {
				break;
			}
		}
		if ( $mode === 'get' ) {
			$outText .= $frame->expand( $node, PPFrame::RECOVER_ORIG );
		}
	}

	# Write out the remainder (in replace mode only)
	if ( $mode === 'replace' ) {
		# Output the replacement text
		# Add two newlines on -- trailing whitespace in $newText is conventionally
		# stripped by the editor, so we need both newlines to restore the paragraph gap
		# Only add trailing whitespace if there is newText
		if ( $newText != "" ) {
			$outText .= $newText . "\n\n";
		}

		for ( ; $node; $node = $node->getNextSibling() ) {
			$outText .= $frame->expand( $node, PPFrame::RECOVER_ORIG );
		}
	}

	# Re-insert stripped tags ($outText is always a string at this point)
	return rtrim( $this->mStripState->unstripBoth( $outText ) );
}

/**
 * This function returns the text of a section, specified by a number ($section).
 * A section is text under a heading like == Heading == or \<h1\>Heading\</h1\>, or
 * the first section before any such heading (section 0).
 *
 * If a section contains subsections, these are also returned.
*
 * @param string $text Text to look in
 * @param string|int $sectionId Section identifier as a number or string
 * (e.g. 0, 1 or 'T-1').
 * @param string $defaultText Default to return if section is not found
 *
 * @return string Text of the requested section
 */
public function getSection( $text, $sectionId, $defaultText = '' ) {
	return $this->extractSections( $text, $sectionId, 'get', $defaultText );
}

/**
 * This function returns $oldtext after the content of the section
 * specified by $section has been replaced with $text. If the target
 * section does not exist, $oldtext is returned unchanged.
 *
 * @param string $oldText Former text of the article
 * @param string|int $sectionId Section identifier as a number or string
 * (e.g. 0, 1 or 'T-1').
 * @param string $newText Replacing text
 *
 * @return string Modified text
 */
public function replaceSection( $oldText, $sectionId, $newText ) {
	return $this->extractSections( $oldText, $sectionId, 'replace', $newText );
}

/**
 * Get the ID of the revision we are parsing
 *
 * @return int|null
 */
public function getRevisionId() {
	return $this->mRevisionId;
}

/**
 * Get the revision object for $this->mRevisionId. The object is looked up
 * once and then kept in mRevisionObject.
 *
 * @return Revision|null Either a Revision object or null
 * @since 1.23 (public since 1.23)
 */
public function getRevisionObject() {
	if ( $this->mRevisionObject !== null ) {
		return $this->mRevisionObject;
	}
	if ( $this->mRevisionId === null ) {
		return null;
	}

	$revision = call_user_func(
		$this->mOptions->getCurrentRevisionCallback(), $this->getTitle(), $this
	);

	# If the parse is for a new revision, then the callback should have
	# already been set to force the object and should match mRevisionId.
	# If not, try to fetch by mRevisionId for sanity.
	if ( $revision && $revision->getId() != $this->mRevisionId ) {
		$revision = Revision::newFromId( $this->mRevisionId );
	}

	$this->mRevisionObject = $revision;

	return $this->mRevisionObject;
}

/**
 * Get the timestamp associated with the current revision, adjusted for
 * the default server-local timestamp
 * @return string
 */
public function getRevisionTimestamp() {
	if ( $this->mRevisionTimestamp !== null ) {
		return $this->mRevisionTimestamp;
	}

	global $wgContLang;

	$revision = $this->getRevisionObject();
	$timestamp = $revision ? $revision->getTimestamp() : wfTimestampNow();

	# The cryptic '' timezone parameter tells to use the site-default
	# timezone offset instead of the user settings.
	# Since this value will be saved into the parser cache, served
	# to other users, and potentially even used inside links and such,
	# it needs to be consistent for all visitors.
	$this->mRevisionTimestamp = $wgContLang->userAdjust( $timestamp, '' );

	return $this->mRevisionTimestamp;
}

/**
 * Get the name of the user that edited the last revision
 *
 * @return string User name. NOTE(review): stays at whatever mRevisionUser
 *  held (apparently null) when there is no revision object and the parse is
 *  neither a 'wiki' output type nor a preview -- confirm callers cope.
 */
public function getRevisionUser() {
	if ( $this->mRevisionUser !== null ) {
		return $this->mRevisionUser;
	}

	$revision = $this->getRevisionObject();

	# if this template is subst: the revision id will be blank,
	# so just use the current user's name
	if ( $revision ) {
		$this->mRevisionUser = $revision->getUserText();
	} elseif ( $this->ot['wiki'] || $this->mOptions->getIsPreview() ) {
		$this->mRevisionUser = $this->getUser()->getName();
	}

	return $this->mRevisionUser;
}

/**
 * Get the size of the revision
 *
 * @return int|null Revision size
 */
public function getRevisionSize() {
	if ( $this->mRevisionSize === null ) {
		$revision = $this->getRevisionObject();

		# if this variable is subst: the revision id will be blank,
		# so just use the parser input size, because the own substitution
		# will change the size.
		$this->mRevisionSize = $revision ? $revision->getSize() : $this->mInputSize;
	}

	return $this->mRevisionSize;
}

/**
 * Mutator for $mDefaultSort; also records the value as the 'defaultsort'
 * property of the parser output.
 *
 * @param string $sort New value
 */
public function setDefaultSort( $sort ) {
	$this->mDefaultSort = $sort;
	$this->mOutput->setProperty( 'defaultsort', $sort );
}

/**
 * Accessor for $mDefaultSort
 * Will use the empty string if none is set.
 *
 * This value is treated as a prefix, so the
 * empty string is equivalent to sorting by
 * page name.
 *
 * @return string
 */
public function getDefaultSort() {
	return $this->mDefaultSort !== false ? $this->mDefaultSort : '';
}

/**
 * Accessor for $mDefaultSort
 * Unlike getDefaultSort(), will return false if none is set
 *
 * @return string|bool
 */
public function getCustomDefaultSort() {
	return $this->mDefaultSort;
}

/**
 * Normalise already-stripped heading text into a section name: whitespace
 * is normalised, character references are decoded, and finally the
 * link-style normalisation of normalizeSectionName() is applied.
 *
 * @param string $text
 * @return string
 */
private static function getSectionNameFromStrippedText( $text ) {
	return self::normalizeSectionName(
		Sanitizer::decodeCharReferences(
			Sanitizer::normalizeSectionNameWhitespace( $text )
		)
	);
}

/**
 * @param string $sectionName
 * @return string Anchor (starting with '#')
 */
private static function makeAnchor( $sectionName ) {
	$id = Sanitizer::escapeIdForLink( $sectionName );

	return '#' . $id;
}

/**
 * Like makeAnchor(), but uses the fallback ID encoding when the second
 * entry of $wgFragmentMode is 'legacy'.
 *
 * @param string $sectionName
 * @return string Anchor (starting with '#')
 */
private static function makeLegacyAnchor( $sectionName ) {
	global $wgFragmentMode;

	$hasLegacyFallback = isset( $wgFragmentMode[1] ) && $wgFragmentMode[1] === 'legacy';
	// ForAttribute() and ForLink() are the same for legacy encoding
	$id = $hasLegacyFallback
		? Sanitizer::escapeIdForAttribute( $sectionName, Sanitizer::ID_FALLBACK )
		: Sanitizer::escapeIdForLink( $sectionName );

	return "#$id";
}

/**
 * Try to guess the section anchor name based on a wikitext fragment
 * presumably extracted from a heading, for example "Header" from
 * "== Header ==".
*
 * @param string $text
 * @return string Anchor (starting with '#')
 */
public function guessSectionNameFromWikiText( $text ) {
	# Strip out wikitext links(they break the anchor)
	$stripped = $this->stripSectionName( $text );

	return self::makeAnchor( self::getSectionNameFromStrippedText( $stripped ) );
}

/**
 * Same as guessSectionNameFromWikiText(), but produces legacy anchors
 * instead, if possible. For use in redirects, since various versions
 * of Microsoft browsers interpret Location: headers as something other
 * than UTF-8, resulting in breakage.
 *
 * @param string $text The section name
 * @return string Anchor (starting with '#')
 */
public function guessLegacySectionNameFromWikiText( $text ) {
	# Strip out wikitext links(they break the anchor)
	$stripped = $this->stripSectionName( $text );

	return self::makeLegacyAnchor( self::getSectionNameFromStrippedText( $stripped ) );
}

/**
 * Like guessSectionNameFromWikiText(), but takes already-stripped text as input.
* @param string $text Section name (plain text)
 * @return string Anchor (starting with '#')
 */
public static function guessSectionNameFromStrippedText( $text ) {
	return self::makeAnchor( self::getSectionNameFromStrippedText( $text ) );
}

/**
 * Apply the same normalization as code making links to this section would
 *
 * @param string $text
 * @return string The fragment as split out by the title parser, or $text
 *  unchanged when the title parser rejects it
 */
private static function normalizeSectionName( $text ) {
	# T90902: ensure the same normalization is applied for IDs as to links
	$titleParser = MediaWikiServices::getInstance()->getTitleParser();
	try {
		$titleParts = $titleParser->splitTitleString( "#$text" );
	} catch ( MalformedTitleException $ex ) {
		return $text;
	}

	return $titleParts['fragment'];
}

/**
 * Strips a text string of wikitext for use in a section anchor
 *
 * Accepts a text string and then removes all wikitext from the
 * string and leaves only the resultant text (i.e. the result of
 * [[User:WikiSysop|Sysop]] would be "Sysop" and the result of
 * [[User:WikiSysop]] would be "User:WikiSysop") - this is intended
 * to create valid section anchors by mimicking the output of the
 * parser when headings are parsed.
 *
 * @param string $text Text string to be stripped of wikitext
 * for use in a Section anchor
 * @return string Filtered text string
 */
public function stripSectionName( $text ) {
	# Strip internal link markup: piped links keep their label,
	# unpiped links keep their target
	$text = preg_replace( '/\[\[:?([^[|]+)\|([^[]+)\]\]/', '$2', $text );
	$text = preg_replace( '/\[\[:?([^[]+)\|?\]\]/', '$1', $text );

	# Strip external link markup
	# @todo FIXME: Not tolerant to blank link text
	# I.E. [https://www.mediawiki.org] will render as [1] or something depending
	# on how many empty links there are on the page - need to figure that out.
	$text = preg_replace( '/\[(?i:' . $this->mUrlProtocols . ')([^ ]+?) ([^[]+)\]/', '$2', $text );

	# Parse wikitext quotes (italics & bold)
	$text = $this->doQuotes( $text );

	# Strip HTML tags
	return StringUtils::delimiterReplace( '<', '>', '', $text );
}

/**
 * strip/replaceVariables/unstrip for preprocessor regression testing
 *
 * @param string $text
 * @param Title $title
 * @param ParserOptions $options
 * @param int $outputType
 *
 * @return string
 */
public function testSrvus( $text, Title $title, ParserOptions $options,
	$outputType = self::OT_HTML
) {
	# Held in a local so the parser lock lasts until this method returns
	$scopedLock = $this->lock();
	$this->startParse( $title, $options, $outputType, true );

	$expanded = $this->replaceVariables( $text );
	$unstripped = $this->mStripState->unstripBoth( $expanded );

	return Sanitizer::removeHTMLtags( $unstripped );
}

/**
 * Run the pre-save transform on $text, on behalf of the user of $options.
 *
 * @param string $text
 * @param Title $title
 * @param ParserOptions $options
 * @return string
 */
public function testPst( $text, Title $title, ParserOptions $options ) {
	return $this->preSaveTransform( $text, $title, $options->getUser(), $options );
}

/**
 * Run testSrvus() with the OT_PREPROCESS output type.
 *
 * @param string $text
 * @param Title $title
 * @param ParserOptions $options
 * @return string
 */
public function testPreprocess( $text, Title $title, ParserOptions $options ) {
	return $this->testSrvus( $text, $title, $options, self::OT_PREPROCESS );
}

/**
 * Call a callback function on all regions of the given text that are not
 * inside strip markers, and replace those regions with the return value
 * of the callback. For example, with input:
 *
 *  aaa<MARKER>bbb
 *
 * This will call the callback function twice, with 'aaa' and 'bbb'. Those
 * two strings will be replaced with the value returned by the callback in
 * each case.
*
 * @param string $s
 * @param callable $callback
 *
 * @return string
 */
public function markerSkipCallback( $s, $callback ) {
	$out = '';
	$length = strlen( $s );
	$suffixLength = strlen( self::MARKER_SUFFIX );
	$i = 0;

	while ( $i < $length ) {
		$markerStart = strpos( $s, self::MARKER_PREFIX, $i );
		if ( $markerStart === false ) {
			# No further marker: the rest of the string is ordinary text
			$out .= call_user_func( $callback, substr( $s, $i ) );
			break;
		}

		# Ordinary text up to the marker goes through the callback
		$out .= call_user_func( $callback, substr( $s, $i, $markerStart - $i ) );

		$markerEnd = strpos( $s, self::MARKER_SUFFIX, $markerStart );
		if ( $markerEnd === false ) {
			# Unterminated marker: copy the remainder through untouched
			$out .= substr( $s, $markerStart );
			break;
		}

		# Copy the marker itself verbatim and continue after it
		$markerEnd += $suffixLength;
		$out .= substr( $s, $markerStart, $markerEnd - $markerStart );
		$i = $markerEnd;
	}

	return $out;
}

/**
 * Remove any strip markers found in the given text.
 *
 * @param string $text
 * @return string
 */
public function killMarkers( $text ) {
	return $this->mStripState->killMarkers( $text );
}

/**
 * Save the parser state required to convert the given half-parsed text to
 * HTML. "Half-parsed" in this context means the output of
 * recursiveTagParse() or internalParse(). This output has strip markers
 * from replaceVariables (extensionSubstitution() etc.), and link
 * placeholders from replaceLinkHolders().
 *
 * Returns an array which can be serialized and stored persistently. This
 * array can later be loaded into another parser instance with
 * unserializeHalfParsedText(). The text can then be safely incorporated into
 * the return value of a parser hook.
*
 * @deprecated since 1.31
 * @param string $text
 *
 * @return array
 */
public function serializeHalfParsedText( $text ) {
	wfDeprecated( __METHOD__, '1.31' );

	return [
		'text' => $text,
		'version' => self::HALF_PARSED_VERSION,
		'stripState' => $this->mStripState->getSubState( $text ),
		'linkHolders' => $this->mLinkHolders->getSubArray( $text )
	];
}

/**
 * Load the parser state given in the $data array, which is assumed to
 * have been generated by serializeHalfParsedText(). The text contents is
 * extracted from the array, and its markers are transformed into markers
 * appropriate for the current Parser instance. This transformed text is
 * returned, and can be safely included in the return value of a parser
 * hook.
 *
 * If the $data array has been stored persistently, the caller should first
 * check whether it is still valid, by calling isValidHalfParsedText().
 *
 * @deprecated since 1.31
 * @param array $data Serialized data
 * @throws MWException
 * @return string
 */
public function unserializeHalfParsedText( $data ) {
	wfDeprecated( __METHOD__, '1.31' );

	$versionMatches = isset( $data['version'] )
		&& $data['version'] == self::HALF_PARSED_VERSION;
	if ( !$versionMatches ) {
		throw new MWException( __METHOD__ . ': invalid version' );
	}

	# First, extract the strip state.
	$texts = $this->mStripState->merge( $data['stripState'], [ $data['text'] ] );

	# Now renumber links
	$texts = $this->mLinkHolders->mergeForeign( $data['linkHolders'], $texts );

	# Should be good to go.
	return $texts[0];
}

/**
 * Returns true if the given array, presumed to be generated by
 * serializeHalfParsedText(), is compatible with the current version of the
 * parser.
*
 * @deprecated since 1.31
 * @param array $data
 *
 * @return bool
 */
public function isValidHalfParsedText( $data ) {
	wfDeprecated( __METHOD__, '1.31' );

	if ( !isset( $data['version'] ) ) {
		return false;
	}
	return $data['version'] == self::HALF_PARSED_VERSION;
}

/**
 * Parse the width param of an image link, such as 300px or 200x300px
 *
 * @param string $value
 * @param bool $parseHeight Whether the "<width>x<height>" form is recognised
 *
 * @return array Empty when $value is empty or matches neither form; otherwise
 *  holds 'width' and, for the "<width>x<height>" form, 'height' as integers
 * @since 1.20
 */
public static function parseWidthParam( $value, $parseHeight = true ) {
	if ( $value === '' ) {
		return [];
	}

	# (T15500) In both cases (width/height and width only),
	# permit trailing "px" for backward compatibility.
	$m = [];
	if ( $parseHeight && preg_match( '/^([0-9]*)x([0-9]*)\s*(?:px)?\s*$/', $value, $m ) ) {
		return [
			'width' => intval( $m[1] ),
			'height' => intval( $m[2] ),
		];
	}
	if ( preg_match( '/^[0-9]*\s*(?:px)?\s*$/', $value ) ) {
		return [ 'width' => intval( $value ) ];
	}

	return [];
}

/**
 * Lock the current instance of the parser.
 *
 * This is meant to stop someone from calling the parser
 * recursively and messing up all the strip state.
 *
 * @throws MWException If parser is in a parse
 * @return ScopedCallback The lock will be released once the return value goes out of scope.
 */
protected function lock() {
	if ( $this->mInParse ) {
		throw new MWException( "Parser state cleared while parsing. "
			. "Did you call Parser::parse recursively? Lock is held by: " . $this->mInParse );
	}

	// Save the backtrace when locking, so that if some code tries locking again,
	// we can print the lock owner's backtrace for easier debugging
	$this->mInParse = ( new Exception )->getTraceAsString();

	// Clears the lock again when the returned object is destroyed
	return new ScopedCallback( function () {
		$this->mInParse = false;
	} );
}

/**
 * Strip outer <p></p> tag from the HTML source of a single paragraph.
 *
 * Returns original HTML if the <p/> tag has any attributes, if there's no wrapping <p/> tag,
 * or if there is more than one <p/> tag in the input HTML.
 *
 * @param string $html
 * @return string
 * @since 1.24
 */
public static function stripOuterParagraph( $html ) {
	$m = [];
	$isWrapped = preg_match( '/^<p>(.*)\n?<\/p>\n?$/sU', $html, $m );

	# A "</p>" inside the captured content means more than one paragraph
	if ( $isWrapped && strpos( $m[1], '</p>' ) === false ) {
		return $m[1];
	}

	return $html;
}

/**
 * Return this parser if it is not doing anything, otherwise
 * get a fresh parser. You can use this method by doing
 * $myParser = $wgParser->getFreshParser(), or more simply
 * $wgParser->getFreshParser()->parse( ... );
 * if you're unsure if $wgParser is safe to use.
 *
 * @since 1.24
 * @return Parser A parser object that is not parsing anything
 */
public function getFreshParser() {
	global $wgParserConf;

	if ( !$this->mInParse ) {
		return $this;
	}

	return new $wgParserConf['class']( $wgParserConf );
}

/**
 * Sets up the PHP implementation of OOUI for use in this request
 * and instructs OutputPage to enable OOUI for itself.
+ * + * @since 1.26 + */ + public function enableOOUI() { + OutputPage::setupOOUI(); + $this->mOutput->setEnableOOUI( true ); + } +} diff --git a/www/wiki/includes/parser/ParserCache.php b/www/wiki/includes/parser/ParserCache.php new file mode 100644 index 00000000..8a7fca6c --- /dev/null +++ b/www/wiki/includes/parser/ParserCache.php @@ -0,0 +1,354 @@ +<?php +/** + * Cache for outputs of the PHP parser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Cache Parser + */ + +use MediaWiki\MediaWikiServices; + +/** + * @ingroup Cache Parser + * @todo document + */ +class ParserCache { + /** + * Constants for self::getKey() + * @since 1.30 + */ + + /** Use only current data */ + const USE_CURRENT_ONLY = 0; + + /** Use expired data if current data is unavailable */ + const USE_EXPIRED = 1; + + /** Use expired data or data from different revisions if current data is unavailable */ + const USE_OUTDATED = 2; + + /** + * Use expired data and data from different revisions, and if all else + * fails vary on all variable options + */ + const USE_ANYTHING = 3; + + /** @var BagOStuff */ + private $mMemc; + + /** + * Anything cached prior to this is invalidated + * + * @var string + */ + private $cacheEpoch; + /** + * Get an instance of this object + * + * @deprecated since 1.30, use MediaWikiServices instead + * @return ParserCache + */ + public static function singleton() { + return MediaWikiServices::getInstance()->getParserCache(); + } + + /** + * Setup a cache pathway with a given back-end storage mechanism. + * + * This class use an invalidation strategy that is compatible with + * MultiWriteBagOStuff in async replication mode. 
+ * + * @param BagOStuff $cache + * @param string $cacheEpoch Anything before this timestamp is invalidated + * @throws MWException + */ + public function __construct( BagOStuff $cache, $cacheEpoch = '20030516000000' ) { + $this->mMemc = $cache; + $this->cacheEpoch = $cacheEpoch; + } + + /** + * @param WikiPage $article + * @param string $hash + * @return mixed|string + */ + protected function getParserOutputKey( $article, $hash ) { + global $wgRequest; + + // idhash seem to mean 'page id' + 'rendering hash' (r3710) + $pageid = $article->getId(); + $renderkey = (int)( $wgRequest->getVal( 'action' ) == 'render' ); + + $key = $this->mMemc->makeKey( 'pcache', 'idhash', "{$pageid}-{$renderkey}!{$hash}" ); + return $key; + } + + /** + * @param WikiPage $page + * @return mixed|string + */ + protected function getOptionsKey( $page ) { + return $this->mMemc->makeKey( 'pcache', 'idoptions', $page->getId() ); + } + + /** + * @param WikiPage $page + * @since 1.28 + */ + public function deleteOptionsKey( $page ) { + $this->mMemc->delete( $this->getOptionsKey( $page ) ); + } + + /** + * Provides an E-Tag suitable for the whole page. Note that $article + * is just the main wikitext. The E-Tag has to be unique to the whole + * page, even if the article itself is the same, so it uses the + * complete set of user options. We don't want to use the preference + * of a different user on a message just because it wasn't used in + * $article. For example give a Chinese interface to a user with + * English preferences. That's why we take into account *all* user + * options. (r70809 CR) + * + * @param WikiPage $article + * @param ParserOptions $popts + * @return string + */ + public function getETag( $article, $popts ) { + return 'W/"' . $this->getParserOutputKey( $article, + $popts->optionsHash( ParserOptions::allCacheVaryingOptions(), $article->getTitle() ) ) . + "--" . $article->getTouched() . '"'; + } + + /** + * Retrieve the ParserOutput from ParserCache, even if it's outdated. 
+ * @param WikiPage $article + * @param ParserOptions $popts + * @return ParserOutput|bool False on failure + */ + public function getDirty( $article, $popts ) { + $value = $this->get( $article, $popts, true ); + return is_object( $value ) ? $value : false; + } + + /** + * Generates a key for caching the given article considering + * the given parser options. + * + * @note Which parser options influence the cache key + * is controlled via ParserOutput::recordOption() or + * ParserOptions::addExtraKey(). + * + * @note Used by Article to provide a unique id for the PoolCounter. + * It would be preferable to have this code in get() + * instead of having Article looking in our internals. + * + * @param WikiPage $article + * @param ParserOptions $popts + * @param int|bool $useOutdated One of the USE constants. For backwards + * compatibility, boolean false is treated as USE_CURRENT_ONLY and + * boolean true is treated as USE_ANYTHING. + * @return bool|mixed|string + * @since 1.30 Changed $useOutdated to an int and added the non-boolean values + */ + public function getKey( $article, $popts, $useOutdated = self::USE_ANYTHING ) { + if ( is_bool( $useOutdated ) ) { + $useOutdated = $useOutdated ? self::USE_ANYTHING : self::USE_CURRENT_ONLY; + } + + if ( $popts instanceof User ) { + wfWarn( "Use of outdated prototype ParserCache::getKey( &\$article, &\$user )\n" ); + $popts = ParserOptions::newFromUser( $popts ); + } + + // Determine the options which affect this article + $casToken = null; + $optionsKey = $this->mMemc->get( + $this->getOptionsKey( $article ), $casToken, BagOStuff::READ_VERIFIED ); + if ( $optionsKey instanceof CacheTime ) { + if ( $useOutdated < self::USE_EXPIRED && $optionsKey->expired( $article->getTouched() ) ) { + wfIncrStats( "pcache.miss.expired" ); + $cacheTime = $optionsKey->getCacheTime(); + wfDebugLog( "ParserCache", + "Parser options key expired, touched " . $article->getTouched() + . 
", epoch {$this->cacheEpoch}, cached $cacheTime\n" ); + return false; + } elseif ( $useOutdated < self::USE_OUTDATED && + $optionsKey->isDifferentRevision( $article->getLatest() ) + ) { + wfIncrStats( "pcache.miss.revid" ); + $revId = $article->getLatest(); + $cachedRevId = $optionsKey->getCacheRevisionId(); + wfDebugLog( "ParserCache", + "ParserOutput key is for an old revision, latest $revId, cached $cachedRevId\n" + ); + return false; + } + + // $optionsKey->mUsedOptions is set by save() by calling ParserOutput::getUsedOptions() + $usedOptions = $optionsKey->mUsedOptions; + wfDebug( "Parser cache options found.\n" ); + } else { + if ( $useOutdated < self::USE_ANYTHING ) { + return false; + } + $usedOptions = ParserOptions::allCacheVaryingOptions(); + } + + return $this->getParserOutputKey( + $article, + $popts->optionsHash( $usedOptions, $article->getTitle() ) + ); + } + + /** + * Retrieve the ParserOutput from ParserCache. + * false if not found or outdated. + * + * @param WikiPage|Article $article + * @param ParserOptions $popts + * @param bool $useOutdated (default false) + * + * @return ParserOutput|bool False on failure + */ + public function get( $article, $popts, $useOutdated = false ) { + $canCache = $article->checkTouched(); + if ( !$canCache ) { + // It's a redirect now + return false; + } + + $touched = $article->getTouched(); + + $parserOutputKey = $this->getKey( $article, $popts, + $useOutdated ? self::USE_OUTDATED : self::USE_CURRENT_ONLY + ); + if ( $parserOutputKey === false ) { + wfIncrStats( 'pcache.miss.absent' ); + return false; + } + + $casToken = null; + /** @var ParserOutput $value */ + $value = $this->mMemc->get( $parserOutputKey, $casToken, BagOStuff::READ_VERIFIED ); + if ( !$value ) { + wfDebug( "ParserOutput cache miss.\n" ); + wfIncrStats( "pcache.miss.absent" ); + return false; + } + + wfDebug( "ParserOutput cache found.\n" ); + + $wikiPage = method_exists( $article, 'getPage' ) + ? 
$article->getPage() + : $article; + + if ( !$useOutdated && $value->expired( $touched ) ) { + wfIncrStats( "pcache.miss.expired" ); + $cacheTime = $value->getCacheTime(); + wfDebugLog( "ParserCache", + "ParserOutput key expired, touched $touched, " + . "epoch {$this->cacheEpoch}, cached $cacheTime\n" ); + $value = false; + } elseif ( !$useOutdated && $value->isDifferentRevision( $article->getLatest() ) ) { + wfIncrStats( "pcache.miss.revid" ); + $revId = $article->getLatest(); + $cachedRevId = $value->getCacheRevisionId(); + wfDebugLog( "ParserCache", + "ParserOutput key is for an old revision, latest $revId, cached $cachedRevId\n" + ); + $value = false; + } elseif ( + Hooks::run( 'RejectParserCacheValue', [ $value, $wikiPage, $popts ] ) === false + ) { + wfIncrStats( 'pcache.miss.rejected' ); + wfDebugLog( "ParserCache", + "ParserOutput key valid, but rejected by RejectParserCacheValue hook handler.\n" + ); + $value = false; + } else { + wfIncrStats( "pcache.hit" ); + } + + return $value; + } + + /** + * @param ParserOutput $parserOutput + * @param WikiPage $page + * @param ParserOptions $popts + * @param string $cacheTime Time when the cache was generated + * @param int $revId Revision ID that was parsed + */ + public function save( $parserOutput, $page, $popts, $cacheTime = null, $revId = null ) { + $expire = $parserOutput->getCacheExpiry(); + if ( $expire > 0 && !$this->mMemc instanceof EmptyBagOStuff ) { + $cacheTime = $cacheTime ?: wfTimestampNow(); + if ( !$revId ) { + $revision = $page->getRevision(); + $revId = $revision ? 
$revision->getId() : null; + } + + $optionsKey = new CacheTime; + $optionsKey->mUsedOptions = $parserOutput->getUsedOptions(); + $optionsKey->updateCacheExpiry( $expire ); + + $optionsKey->setCacheTime( $cacheTime ); + $parserOutput->setCacheTime( $cacheTime ); + $optionsKey->setCacheRevisionId( $revId ); + $parserOutput->setCacheRevisionId( $revId ); + + $parserOutputKey = $this->getParserOutputKey( $page, + $popts->optionsHash( $optionsKey->mUsedOptions, $page->getTitle() ) ); + + // Save the timestamp so that we don't have to load the revision row on view + $parserOutput->setTimestamp( $page->getTimestamp() ); + + $msg = "Saved in parser cache with key $parserOutputKey" . + " and timestamp $cacheTime" . + " and revision id $revId" . + "\n"; + + $parserOutput->mText .= "\n<!-- $msg -->\n"; + wfDebug( $msg ); + + // Save the parser output + $this->mMemc->set( $parserOutputKey, $parserOutput, $expire ); + + // ...and its pointer + $this->mMemc->set( $this->getOptionsKey( $page ), $optionsKey, $expire ); + + Hooks::run( + 'ParserCacheSaveComplete', + [ $this, $parserOutput, $page->getTitle(), $popts, $revId ] + ); + } elseif ( $expire <= 0 ) { + wfDebug( "Parser output was marked as uncacheable and has not been saved.\n" ); + } + } + + /** + * Get the backend BagOStuff instance that + * powers the parser cache + * + * @since 1.30 + * @return BagOStuff + */ + public function getCacheStorage() { + return $this->mMemc; + } +} diff --git a/www/wiki/includes/parser/ParserDiffTest.php b/www/wiki/includes/parser/ParserDiffTest.php new file mode 100644 index 00000000..353825a8 --- /dev/null +++ b/www/wiki/includes/parser/ParserDiffTest.php @@ -0,0 +1,121 @@ +<?php +/** + * Fake parser that output the difference of two different parsers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) 
any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ + +/** + * @ingroup Parser + */ +class ParserDiffTest { + public $parsers; + public $conf; + public $shortOutput = false; + + public function __construct( $conf ) { + if ( !isset( $conf['parsers'] ) ) { + throw new MWException( __METHOD__ . ': no parsers specified' ); + } + $this->conf = $conf; + } + + public function init() { + if ( !is_null( $this->parsers ) ) { + return; + } + + if ( isset( $this->conf['shortOutput'] ) ) { + $this->shortOutput = $this->conf['shortOutput']; + } + + foreach ( $this->conf['parsers'] as $i => $parserConf ) { + if ( !is_array( $parserConf ) ) { + $class = $parserConf; + $parserConf = [ 'class' => $parserConf ]; + } else { + $class = $parserConf['class']; + } + $this->parsers[$i] = new $class( $parserConf ); + } + } + + public function __call( $name, $args ) { + $this->init(); + $results = []; + $mismatch = false; + $lastResult = null; + $first = true; + foreach ( $this->parsers as $i => $parser ) { + $currentResult = call_user_func_array( [ &$this->parsers[$i], $name ], $args ); + if ( $first ) { + $first = false; + } else { + if ( is_object( $lastResult ) ) { + if ( $lastResult != $currentResult ) { + $mismatch = true; + } + } else { + if ( $lastResult !== $currentResult ) { + $mismatch = true; + } + } + } + $results[$i] = $currentResult; + $lastResult = $currentResult; + } + if ( $mismatch ) { + if ( count( $results ) == 2 ) { + $resultsList = []; + foreach ( 
$this->parsers as $i => $parser ) { + $resultsList[] = var_export( $results[$i], true ); + } + $diff = wfDiff( $resultsList[0], $resultsList[1] ); + } else { + $diff = '[too many parsers]'; + } + $msg = "ParserDiffTest: results mismatch on call to $name\n"; + if ( !$this->shortOutput ) { + $msg .= 'Arguments: ' . $this->formatArray( $args ) . "\n"; + } + $msg .= 'Results: ' . $this->formatArray( $results ) . "\n" . + "Diff: $diff\n"; + throw new MWException( $msg ); + } + return $lastResult; + } + + public function formatArray( $array ) { + if ( $this->shortOutput ) { + foreach ( $array as $key => $value ) { + if ( $value instanceof ParserOutput ) { + $array[$key] = "ParserOutput: {$value->getText()}"; + } + } + } + return var_export( $array, true ); + } + + public function setFunctionHook( $id, $callback, $flags = 0 ) { + $this->init(); + foreach ( $this->parsers as $parser ) { + $parser->setFunctionHook( $id, $callback, $flags ); + } + } +} diff --git a/www/wiki/includes/parser/ParserOptions.php b/www/wiki/includes/parser/ParserOptions.php new file mode 100644 index 00000000..ff21ef02 --- /dev/null +++ b/www/wiki/includes/parser/ParserOptions.php @@ -0,0 +1,1395 @@ +<?php +/** + * Options for the PHP parser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ +use Wikimedia\ScopedCallback; + +/** + * @brief Set options of the Parser + * + * How to add an option in core: + * 1. Add it to one of the arrays in ParserOptions::setDefaults() + * 2. If necessary, add an entry to ParserOptions::$inCacheKey + * 3. Add a getter and setter in the section for that. + * + * How to add an option in an extension: + * 1. Use the 'ParserOptionsRegister' hook to register it. + * 2. Where necessary, use $popt->getOption() and $popt->setOption() + * to access it. + * + * @ingroup Parser + */ +class ParserOptions { + + /** + * Default values for all options that are relevant for caching. + * @see self::getDefaults() + * @var array|null + */ + private static $defaults = null; + + /** + * Lazy-loaded options + * @var callback[] + */ + private static $lazyOptions = [ + 'dateformat' => [ __CLASS__, 'initDateFormat' ], + ]; + + /** + * Specify options that are included in the cache key + * @var array + */ + private static $inCacheKey = [ + 'dateformat' => true, + 'numberheadings' => true, + 'thumbsize' => true, + 'stubthreshold' => true, + 'printable' => true, + 'userlang' => true, + ]; + + /** + * Current values for all options that are relevant for caching. + * @var array + */ + private $options; + + /** + * Timestamp used for {{CURRENTDAY}} etc. + * @var string|null + * @note Caching based on parse time is handled externally + */ + private $mTimestamp; + + /** + * Stored user object + * @var User + * @todo Track this for caching somehow without fragmenting the cache insanely + */ + private $mUser; + + /** + * Function to be called when an option is accessed. + * @var callable|null + * @note Used for collecting used options, does not affect caching + */ + private $onAccessCallback = null; + + /** + * If the page being parsed is a redirect, this should hold the redirect + * target. 
+ * @var Title|null + * @todo Track this for caching somehow + */ + private $redirectTarget = null; + + /** + * Appended to the options hash + */ + private $mExtraKey = ''; + + /** + * @name Option accessors + * @{ + */ + + /** + * Fetch an option, generically + * @since 1.30 + * @param string $name Option name + * @return mixed + */ + public function getOption( $name ) { + if ( !array_key_exists( $name, $this->options ) ) { + throw new InvalidArgumentException( "Unknown parser option $name" ); + } + + if ( isset( self::$lazyOptions[$name] ) && $this->options[$name] === null ) { + $this->options[$name] = call_user_func( self::$lazyOptions[$name], $this, $name ); + } + if ( !empty( self::$inCacheKey[$name] ) ) { + $this->optionUsed( $name ); + } + return $this->options[$name]; + } + + /** + * Set an option, generically + * @since 1.30 + * @param string $name Option name + * @param mixed $value New value. Passing null will set null, unlike many + * of the existing accessors which ignore null for historical reasons. + * @return mixed Old value + */ + public function setOption( $name, $value ) { + if ( !array_key_exists( $name, $this->options ) ) { + throw new InvalidArgumentException( "Unknown parser option $name" ); + } + $old = $this->options[$name]; + $this->options[$name] = $value; + return $old; + } + + /** + * Legacy implementation + * @since 1.30 For implementing legacy setters only. Don't use this in new code. + * @deprecated since 1.30 + * @param string $name Option name + * @param mixed $value New value. Passing null does not set the value. 
+ * @return mixed Old value + */ + protected function setOptionLegacy( $name, $value ) { + if ( !array_key_exists( $name, $this->options ) ) { + throw new InvalidArgumentException( "Unknown parser option $name" ); + } + return wfSetVar( $this->options[$name], $value ); + } + + /** + * Whether to extract interlanguage links + * + * When true, interlanguage links will be returned by + * ParserOutput::getLanguageLinks() instead of generating link HTML. + * + * @return bool + */ + public function getInterwikiMagic() { + return $this->getOption( 'interwikiMagic' ); + } + + /** + * Specify whether to extract interlanguage links + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setInterwikiMagic( $x ) { + return $this->setOptionLegacy( 'interwikiMagic', $x ); + } + + /** + * Allow all external images inline? + * @return bool + */ + public function getAllowExternalImages() { + return $this->getOption( 'allowExternalImages' ); + } + + /** + * Allow all external images inline? + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setAllowExternalImages( $x ) { + return $this->setOptionLegacy( 'allowExternalImages', $x ); + } + + /** + * External images to allow + * + * When self::getAllowExternalImages() is false + * + * @return string|string[] URLs to allow + */ + public function getAllowExternalImagesFrom() { + return $this->getOption( 'allowExternalImagesFrom' ); + } + + /** + * External images to allow + * + * When self::getAllowExternalImages() is false + * + * @param string|string[]|null $x New value (null is no change) + * @return string|string[] Old value + */ + public function setAllowExternalImagesFrom( $x ) { + return $this->setOptionLegacy( 'allowExternalImagesFrom', $x ); + } + + /** + * Use the on-wiki external image whitelist? 
+ * @return bool + */ + public function getEnableImageWhitelist() { + return $this->getOption( 'enableImageWhitelist' ); + } + + /** + * Use the on-wiki external image whitelist? + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setEnableImageWhitelist( $x ) { + return $this->setOptionLegacy( 'enableImageWhitelist', $x ); + } + + /** + * Automatically number headings? + * @return bool + */ + public function getNumberHeadings() { + return $this->getOption( 'numberheadings' ); + } + + /** + * Automatically number headings? + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setNumberHeadings( $x ) { + return $this->setOptionLegacy( 'numberheadings', $x ); + } + + /** + * Allow inclusion of special pages? + * @return bool + */ + public function getAllowSpecialInclusion() { + return $this->getOption( 'allowSpecialInclusion' ); + } + + /** + * Allow inclusion of special pages? + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setAllowSpecialInclusion( $x ) { + return $this->setOptionLegacy( 'allowSpecialInclusion', $x ); + } + + /** + * Use tidy to cleanup output HTML? + * @return bool + */ + public function getTidy() { + return $this->getOption( 'tidy' ); + } + + /** + * Use tidy to cleanup output HTML? + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setTidy( $x ) { + return $this->setOptionLegacy( 'tidy', $x ); + } + + /** + * Parsing an interface message? + * @return bool + */ + public function getInterfaceMessage() { + return $this->getOption( 'interfaceMessage' ); + } + + /** + * Parsing an interface message? 
+ * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setInterfaceMessage( $x ) { + return $this->setOptionLegacy( 'interfaceMessage', $x ); + } + + /** + * Target language for the parse + * @return Language|null + */ + public function getTargetLanguage() { + return $this->getOption( 'targetLanguage' ); + } + + /** + * Target language for the parse + * @param Language|null $x New value + * @return Language|null Old value + */ + public function setTargetLanguage( $x ) { + return $this->setOption( 'targetLanguage', $x ); + } + + /** + * Maximum size of template expansions, in bytes + * @return int + */ + public function getMaxIncludeSize() { + return $this->getOption( 'maxIncludeSize' ); + } + + /** + * Maximum size of template expansions, in bytes + * @param int|null $x New value (null is no change) + * @return int Old value + */ + public function setMaxIncludeSize( $x ) { + return $this->setOptionLegacy( 'maxIncludeSize', $x ); + } + + /** + * Maximum number of nodes touched by PPFrame::expand() + * @return int + */ + public function getMaxPPNodeCount() { + return $this->getOption( 'maxPPNodeCount' ); + } + + /** + * Maximum number of nodes touched by PPFrame::expand() + * @param int|null $x New value (null is no change) + * @return int Old value + */ + public function setMaxPPNodeCount( $x ) { + return $this->setOptionLegacy( 'maxPPNodeCount', $x ); + } + + /** + * Maximum number of nodes generated by Preprocessor::preprocessToObj() + * @return int + */ + public function getMaxGeneratedPPNodeCount() { + return $this->getOption( 'maxGeneratedPPNodeCount' ); + } + + /** + * Maximum number of nodes generated by Preprocessor::preprocessToObj() + * @param int|null $x New value (null is no change) + * @return int + */ + public function setMaxGeneratedPPNodeCount( $x ) { + return $this->setOptionLegacy( 'maxGeneratedPPNodeCount', $x ); + } + + /** + * Maximum recursion depth in PPFrame::expand() + * @return int + */ + 
public function getMaxPPExpandDepth() { + return $this->getOption( 'maxPPExpandDepth' ); + } + + /** + * Maximum recursion depth for templates within templates + * @return int + */ + public function getMaxTemplateDepth() { + return $this->getOption( 'maxTemplateDepth' ); + } + + /** + * Maximum recursion depth for templates within templates + * @param int|null $x New value (null is no change) + * @return int Old value + */ + public function setMaxTemplateDepth( $x ) { + return $this->setOptionLegacy( 'maxTemplateDepth', $x ); + } + + /** + * Maximum number of calls per parse to expensive parser functions + * @since 1.20 + * @return int + */ + public function getExpensiveParserFunctionLimit() { + return $this->getOption( 'expensiveParserFunctionLimit' ); + } + + /** + * Maximum number of calls per parse to expensive parser functions + * @since 1.20 + * @param int|null $x New value (null is no change) + * @return int Old value + */ + public function setExpensiveParserFunctionLimit( $x ) { + return $this->setOptionLegacy( 'expensiveParserFunctionLimit', $x ); + } + + /** + * Remove HTML comments + * @warning Only applies to preprocess operations + * @return bool + */ + public function getRemoveComments() { + return $this->getOption( 'removeComments' ); + } + + /** + * Remove HTML comments + * @warning Only applies to preprocess operations + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setRemoveComments( $x ) { + return $this->setOptionLegacy( 'removeComments', $x ); + } + + /** + * Enable limit report in an HTML comment on output + * @return bool + */ + public function getEnableLimitReport() { + return $this->getOption( 'enableLimitReport' ); + } + + /** + * Enable limit report in an HTML comment on output + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function enableLimitReport( $x = true ) { + return $this->setOptionLegacy( 'enableLimitReport', $x ); + } + + /** 
+ * Clean up signature texts? + * @see Parser::cleanSig + * @return bool + */ + public function getCleanSignatures() { + return $this->getOption( 'cleanSignatures' ); + } + + /** + * Clean up signature texts? + * @see Parser::cleanSig + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setCleanSignatures( $x ) { + return $this->setOptionLegacy( 'cleanSignatures', $x ); + } + + /** + * Target attribute for external links + * @return string + */ + public function getExternalLinkTarget() { + return $this->getOption( 'externalLinkTarget' ); + } + + /** + * Target attribute for external links + * @param string|null $x New value (null is no change) + * @return string Old value + */ + public function setExternalLinkTarget( $x ) { + return $this->setOptionLegacy( 'externalLinkTarget', $x ); + } + + /** + * Whether content conversion should be disabled + * @return bool + */ + public function getDisableContentConversion() { + return $this->getOption( 'disableContentConversion' ); + } + + /** + * Whether content conversion should be disabled + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function disableContentConversion( $x = true ) { + return $this->setOptionLegacy( 'disableContentConversion', $x ); + } + + /** + * Whether title conversion should be disabled + * @return bool + */ + public function getDisableTitleConversion() { + return $this->getOption( 'disableTitleConversion' ); + } + + /** + * Whether title conversion should be disabled + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function disableTitleConversion( $x = true ) { + return $this->setOptionLegacy( 'disableTitleConversion', $x ); + } + + /** + * Thumb size preferred by the user. + * @return int + */ + public function getThumbSize() { + return $this->getOption( 'thumbsize' ); + } + + /** + * Thumb size preferred by the user. 
+ * @param int|null $x New value (null is no change) + * @return int Old value + */ + public function setThumbSize( $x ) { + return $this->setOptionLegacy( 'thumbsize', $x ); + } + + /** + * Thumb size preferred by the user. + * @return int + */ + public function getStubThreshold() { + return $this->getOption( 'stubthreshold' ); + } + + /** + * Thumb size preferred by the user. + * @param int|null $x New value (null is no change) + * @return int Old value + */ + public function setStubThreshold( $x ) { + return $this->setOptionLegacy( 'stubthreshold', $x ); + } + + /** + * Parsing the page for a "preview" operation? + * @return bool + */ + public function getIsPreview() { + return $this->getOption( 'isPreview' ); + } + + /** + * Parsing the page for a "preview" operation? + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setIsPreview( $x ) { + return $this->setOptionLegacy( 'isPreview', $x ); + } + + /** + * Parsing the page for a "preview" operation on a single section? + * @return bool + */ + public function getIsSectionPreview() { + return $this->getOption( 'isSectionPreview' ); + } + + /** + * Parsing the page for a "preview" operation on a single section? + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setIsSectionPreview( $x ) { + return $this->setOptionLegacy( 'isSectionPreview', $x ); + } + + /** + * Parsing the printable version of the page? + * @return bool + */ + public function getIsPrintable() { + return $this->getOption( 'printable' ); + } + + /** + * Parsing the printable version of the page? + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setIsPrintable( $x ) { + return $this->setOptionLegacy( 'printable', $x ); + } + + /** + * Transform wiki markup when saving the page? 
+ * @return bool + */ + public function getPreSaveTransform() { + return $this->getOption( 'preSaveTransform' ); + } + + /** + * Transform wiki markup when saving the page? + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setPreSaveTransform( $x ) { + return $this->setOptionLegacy( 'preSaveTransform', $x ); + } + + /** + * Date format index + * @return string + */ + public function getDateFormat() { + return $this->getOption( 'dateformat' ); + } + + /** + * Lazy initializer for dateFormat + */ + private static function initDateFormat( $popt ) { + return $popt->mUser->getDatePreference(); + } + + /** + * Date format index + * @param string|null $x New value (null is no change) + * @return string Old value + */ + public function setDateFormat( $x ) { + return $this->setOptionLegacy( 'dateformat', $x ); + } + + /** + * Get the user language used by the parser for this page and split the parser cache. + * + * @warning: Calling this causes the parser cache to be fragmented by user language! + * To avoid cache fragmentation, output should not depend on the user language. + * Use Parser::getFunctionLang() or Parser::getTargetLanguage() instead! + * + * @note This function will trigger a cache fragmentation by recording the + * 'userlang' option, see optionUsed(). This is done to avoid cache pollution + * when the page is rendered based on the language of the user. + * + * @note When saving, this will return the default language instead of the user's. + * {{int: }} uses this which used to produce inconsistent link tables (T16404). + * + * @return Language + * @since 1.19 + */ + public function getUserLangObj() { + return $this->getOption( 'userlang' ); + } + + /** + * Same as getUserLangObj() but returns a string instead. + * + * @warning: Calling this causes the parser cache to be fragmented by user language! + * To avoid cache fragmentation, output should not depend on the user language. 
+ * Use Parser::getFunctionLang() or Parser::getTargetLanguage() instead! + * + * @see getUserLangObj() + * + * @return string Language code + * @since 1.17 + */ + public function getUserLang() { + return $this->getUserLangObj()->getCode(); + } + + /** + * Set the user language used by the parser for this page and split the parser cache. + * @param string|Language $x New value + * @return Language Old value + */ + public function setUserLang( $x ) { + if ( is_string( $x ) ) { + $x = Language::factory( $x ); + } + + return $this->setOptionLegacy( 'userlang', $x ); + } + + /** + * Are magic ISBN links enabled? + * @since 1.28 + * @return bool + */ + public function getMagicISBNLinks() { + return $this->getOption( 'magicISBNLinks' ); + } + + /** + * Are magic PMID links enabled? + * @since 1.28 + * @return bool + */ + public function getMagicPMIDLinks() { + return $this->getOption( 'magicPMIDLinks' ); + } + /** + * Are magic RFC links enabled? + * @since 1.28 + * @return bool + */ + public function getMagicRFCLinks() { + return $this->getOption( 'magicRFCLinks' ); + } + + /** + * If the wiki is configured to allow raw html ($wgRawHtml = true) + * is it allowed in the specific case of parsing this page. + * + * This is meant to disable unsafe parser tags in cases where + * a malicious user may control the input to the parser. + * + * @note This is expected to be true for normal pages even if the + * wiki has $wgRawHtml disabled in general. The setting only + * signifies that raw html would be unsafe in the current context + * provided that raw html is allowed at all. + * @since 1.29 + * @return bool + */ + public function getAllowUnsafeRawHtml() { + return $this->getOption( 'allowUnsafeRawHtml' ); + } + + /** + * If the wiki is configured to allow raw html ($wgRawHtml = true) + * is it allowed in the specific case of parsing this page. 
+ * @see self::getAllowUnsafeRawHtml() + * @since 1.29 + * @param bool|null $x Value to set or null to get current value + * @return bool Current value for allowUnsafeRawHtml + */ + public function setAllowUnsafeRawHtml( $x ) { + return $this->setOptionLegacy( 'allowUnsafeRawHtml', $x ); + } + + /** + * Class to use to wrap output from Parser::parse() + * @since 1.30 + * @return string|bool + */ + public function getWrapOutputClass() { + return $this->getOption( 'wrapclass' ); + } + + /** + * CSS class to use to wrap output from Parser::parse() + * @since 1.30 + * @param string $className Class name to use for wrapping. + * Passing false to indicate "no wrapping" was deprecated in MediaWiki 1.31. + * @return string|bool Current value + */ + public function setWrapOutputClass( $className ) { + if ( $className === true ) { // DWIM, they probably want the default class name + $className = 'mw-parser-output'; + } + if ( $className === false ) { + wfDeprecated( __METHOD__ . '( false )', '1.31' ); + } + return $this->setOption( 'wrapclass', $className ); + } + + /** + * Callback for current revision fetching; first argument to call_user_func(). + * @since 1.24 + * @return callable + */ + public function getCurrentRevisionCallback() { + return $this->getOption( 'currentRevisionCallback' ); + } + + /** + * Callback for current revision fetching; first argument to call_user_func(). + * @since 1.24 + * @param callable|null $x New value (null is no change) + * @return callable Old value + */ + public function setCurrentRevisionCallback( $x ) { + return $this->setOptionLegacy( 'currentRevisionCallback', $x ); + } + + /** + * Callback for template fetching; first argument to call_user_func(). + * @return callable + */ + public function getTemplateCallback() { + return $this->getOption( 'templateCallback' ); + } + + /** + * Callback for template fetching; first argument to call_user_func(). 
+ * @param callable|null $x New value (null is no change) + * @return callable Old value + */ + public function setTemplateCallback( $x ) { + return $this->setOptionLegacy( 'templateCallback', $x ); + } + + /** + * Callback to generate a guess for {{REVISIONID}} + * @since 1.28 + * @return callable|null + */ + public function getSpeculativeRevIdCallback() { + return $this->getOption( 'speculativeRevIdCallback' ); + } + + /** + * Callback to generate a guess for {{REVISIONID}} + * @since 1.28 + * @param callable|null $x New value (null is no change) + * @return callable|null Old value + */ + public function setSpeculativeRevIdCallback( $x ) { + return $this->setOptionLegacy( 'speculativeRevIdCallback', $x ); + } + + /**@}*/ + + /** + * Timestamp used for {{CURRENTDAY}} etc. + * @return string + */ + public function getTimestamp() { + if ( !isset( $this->mTimestamp ) ) { + $this->mTimestamp = wfTimestampNow(); + } + return $this->mTimestamp; + } + + /** + * Timestamp used for {{CURRENTDAY}} etc. + * @param string|null $x New value (null is no change) + * @return string Old value + */ + public function setTimestamp( $x ) { + return wfSetVar( $this->mTimestamp, $x ); + } + + /** + * Create "edit section" links? + * @deprecated since 1.31, use ParserOutput::getText() options instead. + * @return bool + */ + public function getEditSection() { + wfDeprecated( __METHOD__, '1.31' ); + return true; + } + + /** + * Create "edit section" links? + * @deprecated since 1.31, use ParserOutput::getText() options instead. + * @param bool|null $x New value (null is no change) + * @return bool Old value + */ + public function setEditSection( $x ) { + wfDeprecated( __METHOD__, '1.31' ); + return true; + } + + /** + * Set the redirect target. + * + * Note that setting or changing this does not *make* the page a redirect + * or change its target, it merely records the information for reference + * during the parse. 
+ * + * @since 1.24 + * @param Title|null $title + */ + public function setRedirectTarget( $title ) { + $this->redirectTarget = $title; + } + + /** + * Get the previously-set redirect target. + * + * @since 1.24 + * @return Title|null + */ + public function getRedirectTarget() { + return $this->redirectTarget; + } + + /** + * Extra key that should be present in the parser cache key. + * @warning Consider registering your additional options with the + * ParserOptionsRegister hook instead of using this method. + * @param string $key + */ + public function addExtraKey( $key ) { + $this->mExtraKey .= '!' . $key; + } + + /** + * Current user + * @return User + */ + public function getUser() { + return $this->mUser; + } + + /** + * @warning For interaction with the parser cache, use + * WikiPage::makeParserOptions(), ContentHandler::makeParserOptions(), or + * ParserOptions::newCanonical() instead. + * @param User|null $user Defaults to $wgUser, or to a new User when $wgUser is null + * @param Language|null $lang Defaults to $wgLang + */ + public function __construct( $user = null, $lang = null ) { + if ( $user === null ) { + global $wgUser; + if ( $wgUser === null ) { + $user = new User; + } else { + $user = $wgUser; + } + } + if ( $lang === null ) { + global $wgLang; + if ( !StubObject::isRealObject( $wgLang ) ) { + $wgLang->_unstub(); + } + $lang = $wgLang; + } + $this->initialiseFromUser( $user, $lang ); + } + + /** + * Get a ParserOptions object for an anonymous user + * @warning For interaction with the parser cache, use + * WikiPage::makeParserOptions(), ContentHandler::makeParserOptions(), or + * ParserOptions::newCanonical() instead. + * @since 1.27 + * @return ParserOptions + */ + public static function newFromAnon() { + global $wgContLang; + return new ParserOptions( new User, $wgContLang ); + } + + /** + * Get a ParserOptions object from a given user. + * Language will be taken from $wgLang. 
+ * + * @warning For interaction with the parser cache, use + * WikiPage::makeParserOptions(), ContentHandler::makeParserOptions(), or + * ParserOptions::newCanonical() instead. + * @param User $user + * @return ParserOptions + */ + public static function newFromUser( $user ) { + return new ParserOptions( $user ); + } + + /** + * Get a ParserOptions object from a given user and language + * + * @warning For interaction with the parser cache, use + * WikiPage::makeParserOptions(), ContentHandler::makeParserOptions(), or + * ParserOptions::newCanonical() instead. + * @param User $user + * @param Language $lang + * @return ParserOptions + */ + public static function newFromUserAndLang( User $user, Language $lang ) { + return new ParserOptions( $user, $lang ); + } + + /** + * Get a ParserOptions object from a IContextSource object + * + * @warning For interaction with the parser cache, use + * WikiPage::makeParserOptions(), ContentHandler::makeParserOptions(), or + * ParserOptions::newCanonical() instead. + * @param IContextSource $context + * @return ParserOptions + */ + public static function newFromContext( IContextSource $context ) { + return new ParserOptions( $context->getUser(), $context->getLanguage() ); + } + + /** + * Creates a "canonical" ParserOptions object + * + * For historical reasons, certain options have default values that are + * different from the canonical values used for caching. + * + * @since 1.30 + * @param User|null $user + * @param Language|StubObject|null $lang + * @return ParserOptions + */ + public static function newCanonical( User $user = null, $lang = null ) { + $ret = new ParserOptions( $user, $lang ); + foreach ( self::getCanonicalOverrides() as $k => $v ) { + $ret->setOption( $k, $v ); + } + return $ret; + } + + /** + * Get default option values + * @warning If you change the default for an existing option (unless it's + * being overridden by self::getCanonicalOverrides()), all existing parser + * cache entries will be invalid. 
To avoid bugs, you'll need to handle + * that somehow (e.g. with the RejectParserCacheValue hook) because + * MediaWiki won't do it for you. + * @return array + */ + private static function getDefaults() { + global $wgInterwikiMagic, $wgAllowExternalImages, + $wgAllowExternalImagesFrom, $wgEnableImageWhitelist, $wgAllowSpecialInclusion, + $wgMaxArticleSize, $wgMaxPPNodeCount, $wgMaxTemplateDepth, $wgMaxPPExpandDepth, + $wgCleanSignatures, $wgExternalLinkTarget, $wgExpensiveParserFunctionLimit, + $wgMaxGeneratedPPNodeCount, $wgDisableLangConversion, $wgDisableTitleConversion, + $wgEnableMagicLinks, $wgContLang; + + if ( self::$defaults === null ) { + // *UPDATE* ParserOptions::matches() if any of this changes as needed + self::$defaults = [ + 'dateformat' => null, + 'tidy' => false, + 'interfaceMessage' => false, + 'targetLanguage' => null, + 'removeComments' => true, + 'enableLimitReport' => false, + 'preSaveTransform' => true, + 'isPreview' => false, + 'isSectionPreview' => false, + 'printable' => false, + 'allowUnsafeRawHtml' => true, + 'wrapclass' => 'mw-parser-output', + 'currentRevisionCallback' => [ Parser::class, 'statelessFetchRevision' ], + 'templateCallback' => [ Parser::class, 'statelessFetchTemplate' ], + 'speculativeRevIdCallback' => null, + ]; + + Hooks::run( 'ParserOptionsRegister', [ + &self::$defaults, + &self::$inCacheKey, + &self::$lazyOptions, + ] ); + + ksort( self::$inCacheKey ); + } + + // Unit tests depend on being able to modify the globals at will + return self::$defaults + [ + 'interwikiMagic' => $wgInterwikiMagic, + 'allowExternalImages' => $wgAllowExternalImages, + 'allowExternalImagesFrom' => $wgAllowExternalImagesFrom, + 'enableImageWhitelist' => $wgEnableImageWhitelist, + 'allowSpecialInclusion' => $wgAllowSpecialInclusion, + 'maxIncludeSize' => $wgMaxArticleSize * 1024, + 'maxPPNodeCount' => $wgMaxPPNodeCount, + 'maxGeneratedPPNodeCount' => $wgMaxGeneratedPPNodeCount, + 'maxPPExpandDepth' => $wgMaxPPExpandDepth, + 'maxTemplateDepth' 
=> $wgMaxTemplateDepth, + 'expensiveParserFunctionLimit' => $wgExpensiveParserFunctionLimit, + 'externalLinkTarget' => $wgExternalLinkTarget, + 'cleanSignatures' => $wgCleanSignatures, + 'disableContentConversion' => $wgDisableLangConversion, + 'disableTitleConversion' => $wgDisableLangConversion || $wgDisableTitleConversion, + 'magicISBNLinks' => $wgEnableMagicLinks['ISBN'], + 'magicPMIDLinks' => $wgEnableMagicLinks['PMID'], + 'magicRFCLinks' => $wgEnableMagicLinks['RFC'], + 'numberheadings' => User::getDefaultOption( 'numberheadings' ), + 'thumbsize' => User::getDefaultOption( 'thumbsize' ), + 'stubthreshold' => 0, + 'userlang' => $wgContLang, + ]; + } + + /** + * Get "canonical" non-default option values + * @see self::newCanonical + * @warning If you change the override for an existing option, all existing + * parser cache entries will be invalid. To avoid bugs, you'll need to + * handle that somehow (e.g. with the RejectParserCacheValue hook) because + * MediaWiki won't do it for you. 
+ * @return array + */ + private static function getCanonicalOverrides() { + global $wgEnableParserLimitReporting; + + return [ + 'tidy' => true, + 'enableLimitReport' => $wgEnableParserLimitReporting, + ]; + } + + /** + * Get user options + * + * @param User $user + * @param Language $lang + */ + private function initialiseFromUser( $user, $lang ) { + $this->options = self::getDefaults(); + + $this->mUser = $user; + $this->options['numberheadings'] = $user->getOption( 'numberheadings' ); + $this->options['thumbsize'] = $user->getOption( 'thumbsize' ); + $this->options['stubthreshold'] = $user->getStubThreshold(); + $this->options['userlang'] = $lang; + } + + /** + * Check if these options match that of another options set + * + * This ignores report limit settings that only affect HTML comments + * + * @param ParserOptions $other + * @return bool + * @since 1.25 + */ + public function matches( ParserOptions $other ) { + // Populate lazy options + foreach ( self::$lazyOptions as $name => $callback ) { + if ( $this->options[$name] === null ) { + $this->options[$name] = call_user_func( $callback, $this, $name ); + } + if ( $other->options[$name] === null ) { + $other->options[$name] = call_user_func( $callback, $other, $name ); + } + } + + // Compare most options + $options = array_keys( $this->options ); + $options = array_diff( $options, [ + 'enableLimitReport', // only affects HTML comments + ] ); + foreach ( $options as $option ) { + $o1 = $this->optionToString( $this->options[$option] ); + $o2 = $this->optionToString( $other->options[$option] ); + if ( $o1 !== $o2 ) { + return false; + } + } + + // Compare most other fields + $fields = array_keys( get_class_vars( __CLASS__ ) ); + $fields = array_diff( $fields, [ + 'defaults', // static + 'lazyOptions', // static + 'inCacheKey', // static + 'options', // Already checked above + 'onAccessCallback', // only used for ParserOutput option tracking + ] ); + foreach ( $fields as $field ) { + if ( !is_object( 
$this->$field ) && $this->$field !== $other->$field ) { + return false; + } + } + + return true; + } + + /** + * Registers a callback for tracking which ParserOptions are used. + * This is a private API with the parser. + * @param callable $callback + */ + public function registerWatcher( $callback ) { + $this->onAccessCallback = $callback; + } + + /** + * Called when an option is accessed. + * Calls the watcher that was set using registerWatcher(). + * Typically, the watcher callback is ParserOutput::registerOption(). + * The information registered that way will be used by ParserCache::save(). + * + * @param string $optionName Name of the option + */ + public function optionUsed( $optionName ) { + if ( $this->onAccessCallback ) { + call_user_func( $this->onAccessCallback, $optionName ); + } + } + + /** + * Returns the full array of options that would have been used + * in 1.16. + * Used to get the old parser cache entries when available. + * @deprecated since 1.30. You probably want self::allCacheVaryingOptions() instead. + * @return array + */ + public static function legacyOptions() { + wfDeprecated( __METHOD__, '1.30' ); + return [ + 'stubthreshold', + 'numberheadings', + 'userlang', + 'thumbsize', + 'editsection', + 'printable' + ]; + } + + /** + * Return all option keys that vary the options hash + * @since 1.30 + * @return string[] + */ + public static function allCacheVaryingOptions() { + // Trigger a call to the 'ParserOptionsRegister' hook if it hasn't + // already been called. 
+ if ( self::$defaults === null ) { + self::getDefaults(); + } + return array_keys( array_filter( self::$inCacheKey ) ); + } + + /** + * Convert an option to a string value + * @param mixed $value + * @return string + */ + private function optionToString( $value ) { + if ( $value === true ) { + return '1'; + } elseif ( $value === false ) { + return '0'; + } elseif ( $value === null ) { + return ''; + } elseif ( $value instanceof Language ) { + return $value->getCode(); + } elseif ( is_array( $value ) ) { + return '[' . implode( ',', array_map( [ $this, 'optionToString' ], $value ) ) . ']'; + } else { + return (string)$value; + } + } + + /** + * Generate a hash string with the values set on these ParserOptions + * for the keys given in the array. + * This will be used as part of the hash key for the parser cache, + * so users sharing the options with vary for the same page share + * the same cached data safely. + * + * @since 1.17 + * @param array $forOptions + * @param Title $title Used to get the content language of the page (since r97636) + * @return string Page rendering hash + */ + public function optionsHash( $forOptions, $title = null ) { + global $wgRenderHashAppend; + + $options = $this->options; + $defaults = self::getCanonicalOverrides() + self::getDefaults(); + $inCacheKey = self::$inCacheKey; + + // We only include used options with non-canonical values in the key + // so adding a new option doesn't invalidate the entire parser cache. + // The drawback to this is that changing the default value of an option + // requires manual invalidation of existing cache entries, as mentioned + // in the docs on the relevant methods and hooks. + $values = []; + foreach ( $inCacheKey as $option => $include ) { + if ( $include && in_array( $option, $forOptions, true ) ) { + $v = $this->optionToString( $options[$option] ); + $d = $this->optionToString( $defaults[$option] ); + if ( $v !== $d ) { + $values[] = "$option=$v"; + } + } + } + + $confstr = $values ? 
implode( '!', $values ) : 'canonical'; + + // add in language specific options, if any + // @todo FIXME: This is just a way of retrieving the url/user preferred variant + if ( !is_null( $title ) ) { + $confstr .= $title->getPageLanguage()->getExtraHashOptions(); + } else { + global $wgContLang; + $confstr .= $wgContLang->getExtraHashOptions(); + } + + $confstr .= $wgRenderHashAppend; + + if ( $this->mExtraKey != '' ) { + $confstr .= $this->mExtraKey; + } + + // Give a chance for extensions to modify the hash, if they have + // extra options or other effects on the parser cache. + Hooks::run( 'PageRenderingHash', [ &$confstr, $this->getUser(), &$forOptions ] ); + + // Make it a valid memcached key fragment + $confstr = str_replace( ' ', '_', $confstr ); + + return $confstr; + } + + /** + * Test whether these options are safe to cache + * @since 1.30 + * @return bool + */ + public function isSafeToCache() { + $defaults = self::getCanonicalOverrides() + self::getDefaults(); + foreach ( $this->options as $option => $value ) { + if ( empty( self::$inCacheKey[$option] ) ) { + $v = $this->optionToString( $value ); + $d = $this->optionToString( $defaults[$option] ); + if ( $v !== $d ) { + return false; + } + } + } + return true; + } + + /** + * Sets a hook to force that a page exists, and sets a current revision callback to return + * a revision with custom content when the current revision of the page is requested. 
+ * + * @since 1.25 + * @param Title $title + * @param Content $content + * @param User $user The user that the fake revision is attributed to + * @return ScopedCallback to unset the hook + */ + public function setupFakeRevision( $title, $content, $user ) { + $oldCallback = $this->setCurrentRevisionCallback( + function ( + $titleToCheck, $parser = false ) use ( $title, $content, $user, &$oldCallback + ) { + if ( $titleToCheck->equals( $title ) ) { + return new Revision( [ + 'page' => $title->getArticleID(), + 'user_text' => $user->getName(), + 'user' => $user->getId(), + 'parent_id' => $title->getLatestRevID(), + 'title' => $title, + 'content' => $content + ] ); + } else { + return call_user_func( $oldCallback, $titleToCheck, $parser ); + } + } + ); + + global $wgHooks; + $wgHooks['TitleExists'][] = + function ( $titleToCheck, &$exists ) use ( $title ) { + if ( $titleToCheck->equals( $title ) ) { + $exists = true; + } + }; + end( $wgHooks['TitleExists'] ); + $key = key( $wgHooks['TitleExists'] ); + LinkCache::singleton()->clearBadLink( $title->getPrefixedDBkey() ); + return new ScopedCallback( function () use ( $title, $key ) { + global $wgHooks; + unset( $wgHooks['TitleExists'][$key] ); + LinkCache::singleton()->clearLink( $title ); + } ); + } +} + +/** + * For really cool vim folding this needs to be at the end: + * vim: foldmarker=@{,@} foldmethod=marker + */ diff --git a/www/wiki/includes/parser/ParserOutput.php b/www/wiki/includes/parser/ParserOutput.php new file mode 100644 index 00000000..8f0a1d7c --- /dev/null +++ b/www/wiki/includes/parser/ParserOutput.php @@ -0,0 +1,1219 @@ +<?php + +/** + * Output of the PHP parser. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ + +class ParserOutput extends CacheTime { + /** + * Feature flags to indicate to extensions that MediaWiki core supports and + * uses getText() stateless transforms. + */ + const SUPPORTS_STATELESS_TRANSFORMS = 1; + const SUPPORTS_UNWRAP_TRANSFORM = 1; + + /** + * @var string $mText The output text + */ + public $mText; + + /** + * @var array $mLanguageLinks List of the full text of language links, + * in the order they appear. + */ + public $mLanguageLinks; + + /** + * @var array $mCategories Map of category names to sort keys + */ + public $mCategories; + + /** + * @var array $mIndicators Page status indicators, usually displayed in top-right corner. + */ + public $mIndicators = []; + + /** + * @var string $mTitleText Title text of the chosen language variant, as HTML. + */ + public $mTitleText; + + /** + * @var array $mLinks 2-D map of NS/DBK to ID for the links in the document. + * ID=zero for broken. + */ + public $mLinks = []; + + /** + * @var array $mTemplates 2-D map of NS/DBK to ID for the template references. + * ID=zero for broken. + */ + public $mTemplates = []; + + /** + * @var array $mTemplateIds 2-D map of NS/DBK to rev ID for the template references. + * ID=zero for broken. 
+ */ + public $mTemplateIds = []; + + /** + * @var array $mImages DB keys of the images used, in the array key only + */ + public $mImages = []; + + /** + * @var array $mFileSearchOptions DB keys of the images used mapped to sha1 and MW timestamp. + */ + public $mFileSearchOptions = []; + + /** + * @var array $mExternalLinks External link URLs, in the key only. + */ + public $mExternalLinks = []; + + /** + * @var array $mInterwikiLinks 2-D map of prefix/DBK (in keys only) + * for the inline interwiki links in the document. + */ + public $mInterwikiLinks = []; + + /** + * @var bool $mNewSection Show a new section link? + */ + public $mNewSection = false; + + /** + * @var bool $mHideNewSection Hide the new section link? + */ + public $mHideNewSection = false; + + /** + * @var bool $mNoGallery No gallery on category page? (__NOGALLERY__). + */ + public $mNoGallery = false; + + /** + * @var array $mHeadItems Items to put in the <head> section + */ + public $mHeadItems = []; + + /** + * @var array $mModules Modules to be loaded by ResourceLoader + */ + public $mModules = []; + + /** + * @var array $mModuleScripts Modules of which only the JS will be loaded by ResourceLoader. + */ + public $mModuleScripts = []; + + /** + * @var array $mModuleStyles Modules of which only the CSS will be loaded by ResourceLoader. + */ + public $mModuleStyles = []; + + /** + * @var array $mJsConfigVars JavaScript config variable for mw.config combined with this page. + */ + public $mJsConfigVars = []; + + /** + * @var array $mOutputHooks Hook tags as per $wgParserOutputHooks. + */ + public $mOutputHooks = []; + + /** + * @var array $mWarnings Warning text to be returned to the user. + * Wikitext formatted, in the key only. + */ + public $mWarnings = []; + + /** + * @var array $mSections Table of contents + */ + public $mSections = []; + + /** + * @var array $mProperties Name/value pairs to be cached in the DB. 
+ */ + public $mProperties = []; + + /** + * @var string $mTOCHTML HTML of the TOC. + */ + public $mTOCHTML = ''; + + /** + * @var string $mTimestamp Timestamp of the revision. + */ + public $mTimestamp; + + /** + * @var bool $mEnableOOUI Whether OOUI should be enabled. + */ + public $mEnableOOUI = false; + + /** + * @var string $mIndexPolicy 'index' or 'noindex'? Any other value will result in no change. + */ + private $mIndexPolicy = ''; + + /** + * @var array $mAccessedOptions List of ParserOptions (stored in the keys). + */ + private $mAccessedOptions = []; + + /** + * @var array $mExtensionData extra data used by extensions. + */ + private $mExtensionData = []; + + /** + * @var array $mLimitReportData Parser limit report data. + */ + private $mLimitReportData = []; + + /** @var array Parser limit report data for JSON */ + private $mLimitReportJSData = []; + + /** + * @var array $mParseStartTime Timestamps for getTimeSinceStart(). + */ + private $mParseStartTime = []; + + /** + * @var bool $mPreventClickjacking Whether to emit X-Frame-Options: DENY. + */ + private $mPreventClickjacking = false; + + /** + * @var array $mFlags Generic flags. 
+ */ + private $mFlags = []; + + /** @var int|null Assumed rev ID for {{REVISIONID}} if no revision is set */ + private $mSpeculativeRevId; + + /** @var int|float Upper bound of expiry based on parse duration; defaults to INF */ + private $mMaxAdaptiveExpiry = INF; + + const EDITSECTION_REGEX = + '#<(?:mw:)?editsection page="(.*?)" section="(.*?)"(?:/>|>(.*?)(</(?:mw:)?editsection>))#s'; + + // finalizeAdaptiveCacheExpiry() uses TTL = MAX( m * PARSE_TIME + b, MIN_AR_TTL) + // Current values imply that m=3933.333333 and b=-333.333333 + // See https://www.nngroup.com/articles/website-response-times/ + const PARSE_FAST_SEC = 0.100; // perceived "fast" page parse + const PARSE_SLOW_SEC = 1.0; // perceived "slow" page parse + const FAST_AR_TTL = 60; // adaptive TTL for "fast" pages + const SLOW_AR_TTL = 3600; // adaptive TTL for "slow" pages + const MIN_AR_TTL = 15; // min adaptive TTL (for sanity, pool counter, and edit stashing) + + public function __construct( $text = '', $languageLinks = [], $categoryLinks = [], + $unused = false, $titletext = '' + ) { + $this->mText = $text; + $this->mLanguageLinks = $languageLinks; + $this->mCategories = $categoryLinks; + $this->mTitleText = $titletext; + } + + /** + * Get the cacheable text with <mw:editsection> markers still in it. The + * return value is suitable for writing back via setText() but is not valid + * for display to the user. + * + * @return string + * @since 1.27 + */ + public function getRawText() { + return $this->mText; + } + + /** + * Get the output HTML + * + * @param array $options (since 1.31) Transformations to apply to the HTML + * - allowTOC: (bool) Show the TOC, assuming there were enough headings + * to generate one and `__NOTOC__` wasn't used. Default is true, + * but might be statefully overridden. + * - enableSectionEditLinks: (bool) Include section edit links, assuming + * section edit link tokens are present in the HTML. Default is true, + * but might be statefully overridden. 
+ * - unwrap: (bool) Remove a wrapping mw-parser-output div. Default is false. + * - deduplicateStyles: (bool) When true, which is the default, `<style>` + * tags with the `data-mw-deduplicate` attribute set are deduplicated by + * value of the attribute: all but the first will be replaced by `<link + * rel="mw-deduplicated-inline-style" href="mw-data:..."/>` tags, where + * the scheme-specific-part of the href is the (percent-encoded) value + * of the `data-mw-deduplicate` attribute. + * @return string HTML + */ + public function getText( $options = [] ) { + $options += [ + 'allowTOC' => true, + 'enableSectionEditLinks' => true, + 'unwrap' => false, + 'deduplicateStyles' => true, + ]; + $text = $this->mText; + + Hooks::runWithoutAbort( 'ParserOutputPostCacheTransform', [ $this, &$text, &$options ] ); + + if ( $options['unwrap'] !== false ) { + $start = Html::openElement( 'div', [ + 'class' => 'mw-parser-output' + ] ); + $startLen = strlen( $start ); + $end = Html::closeElement( 'div' ); + $endPos = strrpos( $text, $end ); + $endLen = strlen( $end ); + + if ( substr( $text, 0, $startLen ) === $start && $endPos !== false + // if the closing div is followed by real content, bail out of unwrapping + && preg_match( '/^(?>\s*<!--.*?-->)*\s*$/s', substr( $text, $endPos + $endLen ) ) + ) { + $text = substr( $text, $startLen ); + $text = substr( $text, 0, $endPos - $startLen ) + . substr( $text, $endPos - $startLen + $endLen ); + } + } + + if ( $options['enableSectionEditLinks'] ) { + $text = preg_replace_callback( + self::EDITSECTION_REGEX, + function ( $m ) { + global $wgOut, $wgLang; + $editsectionPage = Title::newFromText( htmlspecialchars_decode( $m[1] ) ); + $editsectionSection = htmlspecialchars_decode( $m[2] ); + $editsectionContent = isset( $m[4] ) ? Sanitizer::decodeCharReferences( $m[3] ) : null; + + if ( !is_object( $editsectionPage ) ) { + throw new MWException( "Bad parser output text." 
); + } + + $skin = $wgOut->getSkin(); + return call_user_func_array( + [ $skin, 'doEditSectionLink' ], + [ $editsectionPage, $editsectionSection, + $editsectionContent, $wgLang->getCode() ] + ); + }, + $text + ); + } else { + $text = preg_replace( self::EDITSECTION_REGEX, '', $text ); + } + + if ( $options['allowTOC'] ) { + $text = str_replace( [ Parser::TOC_START, Parser::TOC_END ], '', $text ); + } else { + $text = preg_replace( + '#' . preg_quote( Parser::TOC_START, '#' ) . '.*?' . preg_quote( Parser::TOC_END, '#' ) . '#s', + '', + $text + ); + } + + if ( $options['deduplicateStyles'] ) { + $seen = []; + $text = preg_replace_callback( + '#<style\s+([^>]*data-mw-deduplicate\s*=[^>]*)>.*?</style>#s', + function ( $m ) use ( &$seen ) { + $attr = Sanitizer::decodeTagAttributes( $m[1] ); + if ( !isset( $attr['data-mw-deduplicate'] ) ) { + return $m[0]; + } + + $key = $attr['data-mw-deduplicate']; + if ( !isset( $seen[$key] ) ) { + $seen[$key] = true; + return $m[0]; + } + + // We were going to use an empty <style> here, but there + // was concern that would be too much overhead for browsers. + // So let's hope a <link> with a non-standard rel and href isn't + // going to be misinterpreted or mangled by any subsequent processing. + return Html::element( 'link', [ + 'rel' => 'mw-deduplicated-inline-style', + 'href' => "mw-data:" . 
wfUrlencode( $key ), + ] ); + }, + $text + ); + } + + return $text; + } + + /** + * @param int $id + * @since 1.28 + */ + public function setSpeculativeRevIdUsed( $id ) { + $this->mSpeculativeRevId = $id; + } + + /** + * @return int|null + * @since 1.28 + */ + public function getSpeculativeRevIdUsed() { + return $this->mSpeculativeRevId; + } + + public function &getLanguageLinks() { + return $this->mLanguageLinks; + } + + public function getInterwikiLinks() { + return $this->mInterwikiLinks; + } + + public function getCategoryLinks() { + return array_keys( $this->mCategories ); + } + + public function &getCategories() { + return $this->mCategories; + } + + /** + * @return array + * @since 1.25 + */ + public function getIndicators() { + return $this->mIndicators; + } + + public function getTitleText() { + return $this->mTitleText; + } + + public function getSections() { + return $this->mSections; + } + + /** + * @deprecated since 1.31 Use getText() options. + */ + public function getEditSectionTokens() { + wfDeprecated( __METHOD__, '1.31' ); + return true; + } + + public function &getLinks() { + return $this->mLinks; + } + + public function &getTemplates() { + return $this->mTemplates; + } + + public function &getTemplateIds() { + return $this->mTemplateIds; + } + + public function &getImages() { + return $this->mImages; + } + + public function &getFileSearchOptions() { + return $this->mFileSearchOptions; + } + + public function &getExternalLinks() { + return $this->mExternalLinks; + } + + public function getNoGallery() { + return $this->mNoGallery; + } + + public function getHeadItems() { + return $this->mHeadItems; + } + + public function getModules() { + return $this->mModules; + } + + public function getModuleScripts() { + return $this->mModuleScripts; + } + + public function getModuleStyles() { + return $this->mModuleStyles; + } + + /** + * @return array + * @since 1.23 + */ + public function getJsConfigVars() { + return $this->mJsConfigVars; + } + + public 
function getOutputHooks() { + return (array)$this->mOutputHooks; + } + + public function getWarnings() { + return array_keys( $this->mWarnings ); + } + + public function getIndexPolicy() { + return $this->mIndexPolicy; + } + + public function getTOCHTML() { + return $this->mTOCHTML; + } + + /** + * @return string|null TS_MW timestamp of the revision content + */ + public function getTimestamp() { + return $this->mTimestamp; + } + + public function getLimitReportData() { + return $this->mLimitReportData; + } + + public function getLimitReportJSData() { + return $this->mLimitReportJSData; + } + + /** + * @deprecated since 1.31 Use getText() options. + */ + public function getTOCEnabled() { + wfDeprecated( __METHOD__, '1.31' ); + return true; + } + + public function getEnableOOUI() { + return $this->mEnableOOUI; + } + + public function setText( $text ) { + return wfSetVar( $this->mText, $text ); + } + + public function setLanguageLinks( $ll ) { + return wfSetVar( $this->mLanguageLinks, $ll ); + } + + public function setCategoryLinks( $cl ) { + return wfSetVar( $this->mCategories, $cl ); + } + + public function setTitleText( $t ) { + return wfSetVar( $this->mTitleText, $t ); + } + + public function setSections( $toc ) { + return wfSetVar( $this->mSections, $toc ); + } + + /** + * @deprecated since 1.31 Use getText() options. + */ + public function setEditSectionTokens( $t ) { + wfDeprecated( __METHOD__, '1.31' ); + return true; + } + + public function setIndexPolicy( $policy ) { + return wfSetVar( $this->mIndexPolicy, $policy ); + } + + public function setTOCHTML( $tochtml ) { + return wfSetVar( $this->mTOCHTML, $tochtml ); + } + + public function setTimestamp( $timestamp ) { + return wfSetVar( $this->mTimestamp, $timestamp ); + } + + /** + * @deprecated since 1.31 Use getText() options. 
+ */ + public function setTOCEnabled( $flag ) { + wfDeprecated( __METHOD__, '1.31' ); + return true; + } + + public function addCategory( $c, $sort ) { + $this->mCategories[$c] = $sort; + } + + /** + * @param string $id + * @param string $content + * @since 1.25 + */ + public function setIndicator( $id, $content ) { + $this->mIndicators[$id] = $content; + } + + /** + * Enables OOUI, if true, in any OutputPage instance this ParserOutput + * object is added to. + * + * @since 1.26 + * @param bool $enable If OOUI should be enabled or not + */ + public function setEnableOOUI( $enable = false ) { + $this->mEnableOOUI = $enable; + } + + public function addLanguageLink( $t ) { + $this->mLanguageLinks[] = $t; + } + + public function addWarning( $s ) { + $this->mWarnings[$s] = 1; + } + + public function addOutputHook( $hook, $data = false ) { + $this->mOutputHooks[] = [ $hook, $data ]; + } + + public function setNewSection( $value ) { + $this->mNewSection = (bool)$value; + } + public function hideNewSection( $value ) { + $this->mHideNewSection = (bool)$value; + } + public function getHideNewSection() { + return (bool)$this->mHideNewSection; + } + public function getNewSection() { + return (bool)$this->mNewSection; + } + + /** + * Checks, if a url is pointing to the own server + * + * @param string $internal The server to check against + * @param string $url The url to check + * @return bool + */ + public static function isLinkInternal( $internal, $url ) { + return (bool)preg_match( '/^' . + # If server is proto relative, check also for http/https links + ( substr( $internal, 0, 2 ) === '//' ? '(?:https?:)?' : '' ) . + preg_quote( $internal, '/' ) . + # check for query/path/anchor or end of link in each case + '(?:[\?\/\#]|$)/i', + $url + ); + } + + public function addExternalLink( $url ) { + # We don't register links pointing to our own server, unless... 
:-) + global $wgServer, $wgRegisterInternalExternals; + + # Replace unnecessary URL escape codes with the referenced character + # This prevents spammers from hiding links from the filters + $url = Parser::normalizeLinkUrl( $url ); + + $registerExternalLink = true; + if ( !$wgRegisterInternalExternals ) { + $registerExternalLink = !self::isLinkInternal( $wgServer, $url ); + } + if ( $registerExternalLink ) { + $this->mExternalLinks[$url] = 1; + } + } + + /** + * Record a local or interwiki inline link for saving in future link tables. + * + * @param Title $title + * @param int|null $id Optional known page_id so we can skip the lookup + */ + public function addLink( Title $title, $id = null ) { + if ( $title->isExternal() ) { + // Don't record interwikis in pagelinks + $this->addInterwikiLink( $title ); + return; + } + $ns = $title->getNamespace(); + $dbk = $title->getDBkey(); + if ( $ns == NS_MEDIA ) { + // Normalize this pseudo-alias if it makes it down here... + $ns = NS_FILE; + } elseif ( $ns == NS_SPECIAL ) { + // We don't record Special: links currently + // It might actually be wise to, but we'd need to do some normalization. 
+ return; + } elseif ( $dbk === '' ) { + // Don't record self links - [[#Foo]] + return; + } + if ( !isset( $this->mLinks[$ns] ) ) { + $this->mLinks[$ns] = []; + } + if ( is_null( $id ) ) { + $id = $title->getArticleID(); + } + $this->mLinks[$ns][$dbk] = $id; + } + + /** + * Register a file dependency for this output + * @param string $name Title dbKey + * @param string $timestamp MW timestamp of file creation (or false if non-existing) + * @param string $sha1 Base 36 SHA-1 of file (or false if non-existing) + * @return void + */ + public function addImage( $name, $timestamp = null, $sha1 = null ) { + $this->mImages[$name] = 1; + if ( $timestamp !== null && $sha1 !== null ) { + $this->mFileSearchOptions[$name] = [ 'time' => $timestamp, 'sha1' => $sha1 ]; + } + } + + /** + * Register a template dependency for this output + * @param Title $title + * @param int $page_id + * @param int $rev_id + * @return void + */ + public function addTemplate( $title, $page_id, $rev_id ) { + $ns = $title->getNamespace(); + $dbk = $title->getDBkey(); + if ( !isset( $this->mTemplates[$ns] ) ) { + $this->mTemplates[$ns] = []; + } + $this->mTemplates[$ns][$dbk] = $page_id; + if ( !isset( $this->mTemplateIds[$ns] ) ) { + $this->mTemplateIds[$ns] = []; + } + $this->mTemplateIds[$ns][$dbk] = $rev_id; // For versioning + } + + /** + * @param Title $title Title object, must be an interwiki link + * @throws MWException If given invalid input + */ + public function addInterwikiLink( $title ) { + if ( !$title->isExternal() ) { + throw new MWException( 'Non-interwiki link passed, internal parser error.' ); + } + $prefix = $title->getInterwiki(); + if ( !isset( $this->mInterwikiLinks[$prefix] ) ) { + $this->mInterwikiLinks[$prefix] = []; + } + $this->mInterwikiLinks[$prefix][$title->getDBkey()] = 1; + } + + /** + * Add some text to the "<head>". + * If $tag is set, the section with that tag will only be included once + * in a given page. 
+ * @param string $section + * @param string|bool $tag + */ + public function addHeadItem( $section, $tag = false ) { + if ( $tag !== false ) { + $this->mHeadItems[$tag] = $section; + } else { + $this->mHeadItems[] = $section; + } + } + + public function addModules( $modules ) { + $this->mModules = array_merge( $this->mModules, (array)$modules ); + } + + public function addModuleScripts( $modules ) { + $this->mModuleScripts = array_merge( $this->mModuleScripts, (array)$modules ); + } + + public function addModuleStyles( $modules ) { + $this->mModuleStyles = array_merge( $this->mModuleStyles, (array)$modules ); + } + + /** + * Add one or more variables to be set in mw.config in JavaScript. + * + * @param string|array $keys Key or array of key/value pairs. + * @param mixed $value [optional] Value of the configuration variable. + * @since 1.23 + */ + public function addJsConfigVars( $keys, $value = null ) { + if ( is_array( $keys ) ) { + foreach ( $keys as $key => $value ) { + $this->mJsConfigVars[$key] = $value; + } + return; + } + + $this->mJsConfigVars[$keys] = $value; + } + + /** + * Copy items from the OutputPage object into this one + * + * @param OutputPage $out + */ + public function addOutputPageMetadata( OutputPage $out ) { + $this->addModules( $out->getModules() ); + $this->addModuleScripts( $out->getModuleScripts() ); + $this->addModuleStyles( $out->getModuleStyles() ); + $this->addJsConfigVars( $out->getJsConfigVars() ); + + $this->mHeadItems = array_merge( $this->mHeadItems, $out->getHeadItemsArray() ); + $this->mPreventClickjacking = $this->mPreventClickjacking || $out->getPreventClickjacking(); + } + + /** + * Add a tracking category, getting the title from a system message, + * or print a debug message if the title is invalid. + * + * Any message used with this function should be registered so it will + * show up on Special:TrackingCategories. 
Core messages should be added + * to SpecialTrackingCategories::$coreTrackingCategories, and extensions + * should add to "TrackingCategories" in their extension.json. + * + * @todo Migrate some code to TrackingCategories + * + * @param string $msg Message key + * @param Title $title title of the page which is being tracked + * @return bool Whether the addition was successful + * @since 1.25 + */ + public function addTrackingCategory( $msg, $title ) { + if ( $title->isSpecialPage() ) { + wfDebug( __METHOD__ . ": Not adding tracking category $msg to special page!\n" ); + return false; + } + + // Important to parse with correct title (T33469) + $cat = wfMessage( $msg ) + ->title( $title ) + ->inContentLanguage() + ->text(); + + # Allow tracking categories to be disabled by setting them to "-" + if ( $cat === '-' ) { + return false; + } + + $containerCategory = Title::makeTitleSafe( NS_CATEGORY, $cat ); + if ( $containerCategory ) { + $this->addCategory( $containerCategory->getDBkey(), $this->getProperty( 'defaultsort' ) ?: '' ); + return true; + } else { + wfDebug( __METHOD__ . ": [[MediaWiki:$msg]] is not a valid title!\n" ); + return false; + } + } + + /** + * Override the title to be used for display + * + * @note this is assumed to have been validated + * (check equal normalisation, etc.) + * + * @note this is expected to be safe HTML, + * ready to be served to the client. + * + * @param string $text Desired title text + */ + public function setDisplayTitle( $text ) { + $this->setTitleText( $text ); + $this->setProperty( 'displaytitle', $text ); + } + + /** + * Get the title to be used for display. + * + * As per the contract of setDisplayTitle(), this is safe HTML, + * ready to be served to the client. + * + * @return string|bool HTML, or false if the title text is an empty string + */ + public function getDisplayTitle() { + $t = $this->getTitleText(); + if ( $t === '' ) { + return false; + } + return $t; + } + + /** + * Fairly generic flag setter thingy. 
+ * @param string $flag + */ + public function setFlag( $flag ) { + $this->mFlags[$flag] = true; + } + + public function getFlag( $flag ) { + return isset( $this->mFlags[$flag] ); + } + + /** + * Set a property to be stored in the page_props database table. + * + * page_props is a key value store indexed by the page ID. This allows + * the parser to set a property on a page which can then be quickly + * retrieved given the page ID or via a DB join when given the page + * title. + * + * Since 1.23, page_props are also indexed by numeric value, to allow + * for efficient "top k" queries of pages wrt a given property. + * + * setProperty() is thus used to propagate properties from the parsed + * page to request contexts other than a page view of the currently parsed + * article. + * + * Some applications examples: + * + * * To implement hidden categories, hiding pages from category listings + * by storing a property. + * + * * Overriding the displayed article title. + * @see ParserOutput::setDisplayTitle() + * + * * To implement image tagging, for example displaying an icon on an + * image thumbnail to indicate that it is listed for deletion on + * Wikimedia Commons. + * This is not actually implemented, yet but would be pretty cool. + * + * @note Do not use setProperty() to set a property which is only used + * in a context where the ParserOutput object itself is already available, + * for example a normal page view. There is no need to save such a property + * in the database since the text is already parsed. You can just hook + * OutputPageParserOutput and get your data out of the ParserOutput object. + * + * If you are writing an extension where you want to set a property in the + * parser which is used by an OutputPageParserOutput hook, you have to + * associate the extension data directly with the ParserOutput object. 
+ * Since MediaWiki 1.21, you can use setExtensionData() to do this: + * + * @par Example: + * @code + * $parser->getOutput()->setExtensionData( 'my_ext_foo', '...' ); + * @endcode + * + * And then later, in OutputPageParserOutput or similar: + * + * @par Example: + * @code + * $output->getExtensionData( 'my_ext_foo' ); + * @endcode + * + * In MediaWiki 1.20 and older, you have to use a custom member variable + * within the ParserOutput object: + * + * @par Example: + * @code + * $parser->getOutput()->my_ext_foo = '...'; + * @endcode + * @param string $name + * @param mixed $value + */ + public function setProperty( $name, $value ) { + $this->mProperties[$name] = $value; + } + + /** + * @param string $name The property name to look up. + * + * @return mixed|bool The value previously set using setProperty(). False if null or no value + * was set for the given property name. + * + * @note You need to use getProperties() to check for boolean and null properties. + */ + public function getProperty( $name ) { + return isset( $this->mProperties[$name] ) ? $this->mProperties[$name] : false; + } + + public function unsetProperty( $name ) { + unset( $this->mProperties[$name] ); + } + + public function getProperties() { + if ( !isset( $this->mProperties ) ) { + $this->mProperties = []; + } + return $this->mProperties; + } + + /** + * Returns the options from its ParserOptions which have been taken + * into account to produce this output, or an empty array if not available. + * @return array + */ + public function getUsedOptions() { + if ( !isset( $this->mAccessedOptions ) ) { + return []; + } + return array_keys( $this->mAccessedOptions ); + } + + /** + * Tags a parser option for use in the cache key for this parser output. + * Registered as a watcher at ParserOptions::registerWatcher() by Parser::clearState(). + * The information gathered here is available via getUsedOptions(), + * and is used by ParserCache::save(). 
+ * + * @see ParserCache::getKey + * @see ParserCache::save + * @see ParserOptions::addExtraKey + * @see ParserOptions::optionsHash + * @param string $option + */ + public function recordOption( $option ) { + $this->mAccessedOptions[$option] = true; + } + + /** + * Attaches arbitrary data to this ParserOutput. This can be used to store some information in + * the ParserOutput object for later use during page output. The data will be cached along with + * the ParserOutput object, but unlike data set using setProperty(), it is not recorded in the + * database. + * + * This method is provided to overcome the unsafe practice of attaching extra information to a + * ParserOutput by directly assigning member variables. + * + * To use setExtensionData() to pass extension information from a hook inside the parser to a + * hook in the page output, use this in the parser hook: + * + * @par Example: + * @code + * $parser->getOutput()->setExtensionData( 'my_ext_foo', '...' ); + * @endcode + * + * And then later, in OutputPageParserOutput or similar: + * + * @par Example: + * @code + * $output->getExtensionData( 'my_ext_foo' ); + * @endcode + * + * In MediaWiki 1.20 and older, you have to use a custom member variable + * within the ParserOutput object: + * + * @par Example: + * @code + * $parser->getOutput()->my_ext_foo = '...'; + * @endcode + * + * @since 1.21 + * + * @param string $key The key for accessing the data. Extensions should take care to avoid + * conflicts in naming keys. It is suggested to use the extension's name as a prefix. + * + * @param mixed $value The value to set. Setting a value to null is equivalent to removing + * the value. + */ + public function setExtensionData( $key, $value ) { + if ( $value === null ) { + unset( $this->mExtensionData[$key] ); + } else { + $this->mExtensionData[$key] = $value; + } + } + + /** + * Gets extensions data previously attached to this ParserOutput using setExtensionData(). 
+ * Typically, such data would be set while parsing the page, e.g. by a parser function. + * + * @since 1.21 + * + * @param string $key The key to look up. + * + * @return mixed|null The value previously set for the given key using setExtensionData() + * or null if no value was set for this key. + */ + public function getExtensionData( $key ) { + if ( isset( $this->mExtensionData[$key] ) ) { + return $this->mExtensionData[$key]; + } + + return null; + } + + private static function getTimes( $clock = null ) { + $ret = []; + if ( !$clock || $clock === 'wall' ) { + $ret['wall'] = microtime( true ); + } + if ( !$clock || $clock === 'cpu' ) { + $ru = wfGetRusage(); + if ( $ru ) { + $ret['cpu'] = $ru['ru_utime.tv_sec'] + $ru['ru_utime.tv_usec'] / 1e6; + $ret['cpu'] += $ru['ru_stime.tv_sec'] + $ru['ru_stime.tv_usec'] / 1e6; + } + } + return $ret; + } + + /** + * Resets the parse start timestamps for future calls to getTimeSinceStart() + * @since 1.22 + */ + public function resetParseStartTime() { + $this->mParseStartTime = self::getTimes(); + } + + /** + * Returns the time since resetParseStartTime() was last called + * + * Clocks available are: + * - wall: Wall clock time + * - cpu: CPU time (requires getrusage) + * + * @since 1.22 + * @param string $clock + * @return float|null + */ + public function getTimeSinceStart( $clock ) { + if ( !isset( $this->mParseStartTime[$clock] ) ) { + return null; + } + + $end = self::getTimes( $clock ); + return $end[$clock] - $this->mParseStartTime[$clock]; + } + + /** + * Sets parser limit report data for a key + * + * The key is used as the prefix for various messages used for formatting: + * - $key: The label for the field in the limit report + * - $key-value-text: Message used to format the value in the "NewPP limit + * report" HTML comment. If missing, uses $key-format. + * - $key-value-html: Message used to format the value in the preview + * limit report table. If missing, uses $key-format. 
+ * - $key-value: Message used to format the value. If missing, uses "$1". + * + * Note that all values are interpreted as wikitext, and so should be + * encoded with htmlspecialchars() as necessary, but should avoid complex + * HTML for sanity of display in the "NewPP limit report" comment. + * + * @since 1.22 + * @param string $key Message key + * @param mixed $value Appropriate for Message::params() + */ + public function setLimitReportData( $key, $value ) { + $this->mLimitReportData[$key] = $value; + + if ( is_array( $value ) ) { + if ( array_keys( $value ) === [ 0, 1 ] + && is_numeric( $value[0] ) + && is_numeric( $value[1] ) + ) { + $data = [ 'value' => $value[0], 'limit' => $value[1] ]; + } else { + $data = $value; + } + } else { + $data = $value; + } + + if ( strpos( $key, '-' ) ) { + list( $ns, $name ) = explode( '-', $key, 2 ); + $this->mLimitReportJSData[$ns][$name] = $data; + } else { + $this->mLimitReportJSData[$key] = $data; + } + } + + /** + * Check whether the cache TTL was lowered due to dynamic content + * + * When content is determined by more than hard state (e.g. page edits), + * such as template/file transclusions based on the current timestamp or + * extension tags that generate lists based on queries, this returns true. 
+ * + * @return bool + * @since 1.25 + */ + public function hasDynamicContent() { + global $wgParserCacheExpireTime; + + return $this->getCacheExpiry() < $wgParserCacheExpireTime; + } + + /** + * Get or set the prevent-clickjacking flag + * + * @since 1.24 + * @param bool|null $flag New flag value, or null to leave it unchanged + * @return bool Old flag value + */ + public function preventClickjacking( $flag = null ) { + return wfSetVar( $this->mPreventClickjacking, $flag ); + } + + /** + * Lower the runtime adaptive TTL to at most this value + * + * @param int $ttl + * @since 1.28 + */ + public function updateRuntimeAdaptiveExpiry( $ttl ) { + $this->mMaxAdaptiveExpiry = min( $ttl, $this->mMaxAdaptiveExpiry ); + $this->updateCacheExpiry( $ttl ); + } + + /** + * Call this when parsing is done to lower the TTL based on low parse times + * + * @since 1.28 + */ + public function finalizeAdaptiveCacheExpiry() { + if ( is_infinite( $this->mMaxAdaptiveExpiry ) ) { + return; // not set + } + + $runtime = $this->getTimeSinceStart( 'wall' ); + if ( is_float( $runtime ) ) { + $slope = ( self::SLOW_AR_TTL - self::FAST_AR_TTL ) + / ( self::PARSE_SLOW_SEC - self::PARSE_FAST_SEC ); + // SLOW_AR_TTL = PARSE_SLOW_SEC * $slope + $point + $point = self::SLOW_AR_TTL - self::PARSE_SLOW_SEC * $slope; + + $adaptiveTTL = min( + max( $slope * $runtime + $point, self::MIN_AR_TTL ), + $this->mMaxAdaptiveExpiry + ); + $this->updateCacheExpiry( $adaptiveTTL ); + } + } + + public function __sleep() { + return array_diff( + array_keys( get_object_vars( $this ) ), + [ 'mParseStartTime' ] + ); + } +} diff --git a/www/wiki/includes/parser/Preprocessor.php b/www/wiki/includes/parser/Preprocessor.php new file mode 100644 index 00000000..49e961ae --- /dev/null +++ b/www/wiki/includes/parser/Preprocessor.php @@ -0,0 +1,436 @@ +<?php +/** + * Interfaces for preprocessors + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License 
as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ + +use MediaWiki\Logger\LoggerFactory; + +/** + * @ingroup Parser + */ +abstract class Preprocessor { + + const CACHE_VERSION = 1; + + /** + * @var array Brace matching rules. + */ + protected $rules = [ + '{' => [ + 'end' => '}', + 'names' => [ + 2 => 'template', + 3 => 'tplarg', + ], + 'min' => 2, + 'max' => 3, + ], + '[' => [ + 'end' => ']', + 'names' => [ 2 => null ], + 'min' => 2, + 'max' => 2, + ], + '-{' => [ + 'end' => '}-', + 'names' => [ 2 => null ], + 'min' => 2, + 'max' => 2, + ], + ]; + + /** + * Store a document tree in the cache. + * + * @param string $text + * @param int $flags + * @param string $tree + */ + protected function cacheSetTree( $text, $flags, $tree ) { + $config = RequestContext::getMain()->getConfig(); + + $length = strlen( $text ); + $threshold = $config->get( 'PreprocessorCacheThreshold' ); + if ( $threshold === false || $length < $threshold || $length > 1e6 ) { + return; + } + + $cache = ObjectCache::getLocalClusterInstance(); + $key = $cache->makeKey( + defined( 'static::CACHE_PREFIX' ) ? static::CACHE_PREFIX : static::class, + md5( $text ), $flags ); + $value = sprintf( "%08d", static::CACHE_VERSION ) . 
$tree; + + $cache->set( $key, $value, 86400 ); + + LoggerFactory::getInstance( 'Preprocessor' ) + ->info( "Cached preprocessor output (key: $key)" ); + } + + /** + * Attempt to load a precomputed document tree for some given wikitext + * from the cache. + * + * @param string $text + * @param int $flags + * @return string|bool The tree string stored by cacheSetTree(), or false if unavailable + */ + protected function cacheGetTree( $text, $flags ) { + $config = RequestContext::getMain()->getConfig(); + + $length = strlen( $text ); + $threshold = $config->get( 'PreprocessorCacheThreshold' ); + if ( $threshold === false || $length < $threshold || $length > 1e6 ) { + return false; + } + + $cache = ObjectCache::getLocalClusterInstance(); + + $key = $cache->makeKey( + defined( 'static::CACHE_PREFIX' ) ? static::CACHE_PREFIX : static::class, + md5( $text ), $flags ); + + $value = $cache->get( $key ); + if ( !$value ) { + return false; + } + + $version = intval( substr( $value, 0, 8 ) ); + if ( $version !== static::CACHE_VERSION ) { + return false; + } + + LoggerFactory::getInstance( 'Preprocessor' ) + ->info( "Loaded preprocessor output from cache (key: $key)" ); + + return substr( $value, 8 ); + } + + /** + * Create a new top-level frame for expansion of a page + * + * @return PPFrame + */ + abstract public function newFrame(); + + /** + * Create a new custom frame for programmatic use of parameter replacement + * as used in some extensions. + * + * @param array $args + * + * @return PPFrame + */ + abstract public function newCustomFrame( $args ); + + /** + * Create a new custom node for programmatic use of parameter replacement + * as used in some extensions. 
+ * + * @param array $values + */ + abstract public function newPartNodeArray( $values ); + + /** + * Preprocess text to a PPNode + * + * @param string $text + * @param int $flags + * + * @return PPNode + */ + abstract public function preprocessToObj( $text, $flags = 0 ); +} + +/** + * @ingroup Parser + */ +interface PPFrame { + const NO_ARGS = 1; + const NO_TEMPLATES = 2; + const STRIP_COMMENTS = 4; + const NO_IGNORE = 8; + const RECOVER_COMMENTS = 16; + const NO_TAGS = 32; + + const RECOVER_ORIG = 59; // = 1|2|8|16|32 no constant expression support in PHP yet + + /** This constant exists when $indexOffset is supported in newChild() */ + const SUPPORTS_INDEX_OFFSET = 1; + + /** + * Create a child frame + * + * @param array|bool $args + * @param bool|Title $title + * @param int $indexOffset A number subtracted from the index attributes of the arguments + * + * @return PPFrame + */ + public function newChild( $args = false, $title = false, $indexOffset = 0 ); + + /** + * Expand a document tree node, caching the result on its parent with the given key + * @param string|int $key + * @param string|PPNode $root + * @param int $flags + * @return string + */ + public function cachedExpand( $key, $root, $flags = 0 ); + + /** + * Expand a document tree node + * @param string|PPNode $root + * @param int $flags + * @return string + */ + public function expand( $root, $flags = 0 ); + + /** + * Implode with flags for expand() + * @param string $sep + * @param int $flags + * @param string|PPNode $args,... + * @return string + */ + public function implodeWithFlags( $sep, $flags /*, ... */ ); + + /** + * Implode with no flags specified + * @param string $sep + * @param string|PPNode $args,... + * @return string + */ + public function implode( $sep /*, ... */ ); + + /** + * Makes an object that, when expand()ed, will be the same as one obtained + * with implode() + * @param string $sep + * @param string|PPNode $args,... 
+ * @return PPNode + */ + public function virtualImplode( $sep /*, ... */ ); + + /** + * Virtual implode with brackets + * @param string $start + * @param string $sep + * @param string $end + * @param string|PPNode $args,... + * @return PPNode + */ + public function virtualBracketedImplode( $start, $sep, $end /*, ... */ ); + + /** + * Returns true if there are no arguments in this frame + * + * @return bool + */ + public function isEmpty(); + + /** + * Returns all arguments of this frame + * @return array + */ + public function getArguments(); + + /** + * Returns all numbered arguments of this frame + * @return array + */ + public function getNumberedArguments(); + + /** + * Returns all named arguments of this frame + * @return array + */ + public function getNamedArguments(); + + /** + * Get an argument to this frame by name + * @param int|string $name + * @return string|bool + */ + public function getArgument( $name ); + + /** + * Returns true if the infinite loop check is OK, false if a loop is detected + * + * @param Title $title + * @return bool + */ + public function loopCheck( $title ); + + /** + * Return true if the frame is a template frame + * @return bool + */ + public function isTemplate(); + + /** + * Set the "volatile" flag. + * + * Note that this is somewhat of a "hack" in order to make extensions + * with side effects (such as Cite) work with the PHP parser. New + * extensions should be written in a way that they do not need this + * function, because other parsers (such as Parsoid) are not guaranteed + * to respect it, and it may be removed in the future. + * + * @param bool $flag + */ + public function setVolatile( $flag = true ); + + /** + * Get the "volatile" flag. + * + * Callers should avoid caching the result of an expansion if it has the + * volatile flag set. + * + * @see self::setVolatile() + * @return bool + */ + public function isVolatile(); + + /** + * Get the TTL of the frame's output. 
+ * + * This is the maximum amount of time, in seconds, that this frame's + * output should be cached for. A value of null indicates that no + * maximum has been specified. + * + * Note that this TTL only applies to caching frames as parts of pages. + * It is not relevant to caching the entire rendered output of a page. + * + * @return int|null + */ + public function getTTL(); + + /** + * Set the TTL of the output of this frame and all of its ancestors. + * Has no effect if the new TTL is greater than the one already set. + * Note that it is the caller's responsibility to change the cache + * expiry of the page as a whole, if such behavior is desired. + * + * @see self::getTTL() + * @param int $ttl + */ + public function setTTL( $ttl ); + + /** + * Get the title of this frame + * + * @return Title + */ + public function getTitle(); +} + +/** + * There are three types of nodes: + * * Tree nodes, which have a name and contain other nodes as children + * * Array nodes, which also contain other nodes but aren't considered part of a tree + * * Leaf nodes, which contain the actual data + * + * This interface provides access to the tree structure and to the contents of array nodes, + * but it does not provide access to the internal structure of leaf nodes. Access to leaf + * data is provided via two means: + * * PPFrame::expand(), which provides expanded text + * * The PPNode::split*() functions, which provide metadata about certain types of tree node + * @ingroup Parser + */ +interface PPNode { + /** + * Get an array-type node containing the children of this node. + * Returns false if this is not a tree node. + * @return PPNode|bool + */ + public function getChildren(); + + /** + * Get the first child of a tree node. False if there isn't one. + * + * @return PPNode|bool + */ + public function getFirstChild(); + + /** + * Get the next sibling of any node. 
False if there isn't one + * @return bool|PPNode + */ + public function getNextSibling(); + + /** + * Get all children of this tree node which have a given name. + * Returns an array-type node, or false if this is not a tree node. + * @param string $type + * @return bool|PPNode + */ + public function getChildrenOfType( $type ); + + /** + * Returns the length of the array, or false if this is not an array-type node + */ + public function getLength(); + + /** + * Returns an item of an array-type node + * @param int $i + * @return bool|PPNode + */ + public function item( $i ); + + /** + * Get the name of this node. The following names are defined here: + * + * h A heading node. + * template A double-brace node. + * tplarg A triple-brace node. + * title The first argument to a template or tplarg node. + * part Subsequent arguments to a template or tplarg node. + * #nodelist An array-type node + * + * The subclass may define various other names for tree and leaf nodes. + * @return string + */ + public function getName(); + + /** + * Split a "<part>" node into an associative array containing: + * name PPNode name + * index String index + * value PPNode value + * @return array + */ + public function splitArg(); + + /** + * Split an "<ext>" node into an associative array containing name, attr, inner and close. + * All values in the resulting array are PPNodes. Inner and close are optional. 
+ * @return array + */ + public function splitExt(); + + /** + * Split an "<h>" node + * @return array + */ + public function splitHeading(); +} diff --git a/www/wiki/includes/parser/Preprocessor_DOM.php b/www/wiki/includes/parser/Preprocessor_DOM.php new file mode 100644 index 00000000..64edbb2f --- /dev/null +++ b/www/wiki/includes/parser/Preprocessor_DOM.php @@ -0,0 +1,2054 @@ +<?php +/** + * Preprocessor using PHP's dom extension + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ + +/** + * @ingroup Parser + */ +// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps +class Preprocessor_DOM extends Preprocessor { + + /** + * @var Parser + */ + public $parser; + + /** + * @var int|float|string|bool PHP memory_limit expressed in bytes, or false when the + * setting is empty, -1 or not in a recognised format + */ + public $memoryLimit; + + const CACHE_PREFIX = 'preprocess-xml'; + + /** + * Store the parser and derive the memory limit, in bytes, from the memory_limit ini setting. + * + * @param Parser $parser + */ + public function __construct( $parser ) { + $this->parser = $parser; + $mem = ini_get( 'memory_limit' ); + $this->memoryLimit = false; + if ( strval( $mem ) !== '' && $mem != -1 ) { + if ( preg_match( '/^\d+$/', $mem ) ) { + $this->memoryLimit = $mem; + } elseif ( preg_match( '/^(\d+)([KMG])$/i', $mem, $m ) ) { + // PHP shorthand byte values: K, M and G are successive powers of 1024 + $shift = [ 'K' => 10, 'M' => 20, 'G' => 30 ]; + $this->memoryLimit = $m[1] * ( 1 << $shift[strtoupper( $m[2] )] ); + } + } + } + + /** + * @return PPFrame_DOM + */ + public function newFrame() { + return new PPFrame_DOM( $this ); + } + + /** + * @param array $args + * @return PPCustomFrame_DOM + */ + public function newCustomFrame( $args ) { + return new PPCustomFrame_DOM( $this, $args ); + } + + /** + * @param array $values + * @return PPNode_DOM + * @throws MWException + */ + public function newPartNodeArray( $values ) { + // NOTE: DOM manipulation is slower than building & parsing XML! (or so Tim says) + $xml = "<list>"; + + foreach ( $values as $k => $val ) { + if ( is_int( $k ) ) { + $xml .= "<part><name index=\"$k\"/><value>" + . htmlspecialchars( $val ) . "</value></part>"; + } else { + $xml .= "<part><name>" . htmlspecialchars( $k ) + . "</name>=<value>" . htmlspecialchars( $val ) . 
"</value></part>"; + } + } + + $xml .= "</list>"; + + $dom = new DOMDocument(); + Wikimedia\suppressWarnings(); + $result = $dom->loadXML( $xml ); + Wikimedia\restoreWarnings(); + if ( !$result ) { + // Try running the XML through UtfNormal to get rid of invalid characters + $xml = UtfNormal\Validator::cleanUp( $xml ); + // 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2 + // don't barf when the XML is >256 levels deep + $result = $dom->loadXML( $xml, 1 << 19 ); + } + + if ( !$result ) { + throw new MWException( 'Parameters passed to ' . __METHOD__ . ' result in invalid XML' ); + } + + $root = $dom->documentElement; + $node = new PPNode_DOM( $root->childNodes ); + return $node; + } + + /** + * @throws MWException + * @return bool + */ + public function memCheck() { + if ( $this->memoryLimit === false ) { + return true; + } + $usage = memory_get_usage(); + if ( $usage > $this->memoryLimit * 0.9 ) { + $limit = intval( $this->memoryLimit * 0.9 / 1048576 + 0.5 ); + throw new MWException( "Preprocessor hit 90% memory limit ($limit MB)" ); + } + return $usage <= $this->memoryLimit * 0.8; + } + + /** + * Preprocess some wikitext and return the document tree. + * This is the ghost of Parser::replace_variables(). + * + * @param string $text The text to parse + * @param int $flags Bitwise combination of: + * Parser::PTD_FOR_INCLUSION Handle "<noinclude>" and "<includeonly>" + * as if the text is being included. Default + * is to assume a direct page view. + * + * The generated DOM tree must depend only on the input text and the flags. + * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of T6899. + * + * Any flag added to the $flags parameter here, or any other parameter liable to cause a + * change in the DOM tree for a given text, must be passed through the section identifier + * in the section edit link and thus back to extractSections(). 
+ * + * The output of this function is currently only cached in process memory, but a persistent + * cache may be implemented at a later date which takes further advantage of these strict + * dependency requirements. + * + * @throws MWException + * @return PPNode_DOM + */ + public function preprocessToObj( $text, $flags = 0 ) { + $xml = $this->cacheGetTree( $text, $flags ); + if ( $xml === false ) { + $xml = $this->preprocessToXml( $text, $flags ); + $this->cacheSetTree( $text, $flags, $xml ); + } + + // Fail if the number of elements exceeds acceptable limits + // Do not attempt to generate the DOM + $this->parser->mGeneratedPPNodeCount += substr_count( $xml, '<' ); + $max = $this->parser->mOptions->getMaxGeneratedPPNodeCount(); + if ( $this->parser->mGeneratedPPNodeCount > $max ) { + // if ( $cacheable ) { ... } + throw new MWException( __METHOD__ . ': generated node count limit exceeded' ); + } + + $dom = new DOMDocument; + Wikimedia\suppressWarnings(); + $result = $dom->loadXML( $xml ); + Wikimedia\restoreWarnings(); + if ( !$result ) { + // Try running the XML through UtfNormal to get rid of invalid characters + $xml = UtfNormal\Validator::cleanUp( $xml ); + // 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2 + // don't barf when the XML is >256 levels deep. + $result = $dom->loadXML( $xml, 1 << 19 ); + } + if ( $result ) { + $obj = new PPNode_DOM( $dom->documentElement ); + } + + // if ( $cacheable ) { ... } + + if ( !$result ) { + throw new MWException( __METHOD__ . 
' generated invalid XML' ); + } + return $obj; + } + + /** + * @param string $text + * @param int $flags + * @return string + */ + public function preprocessToXml( $text, $flags = 0 ) { + global $wgDisableLangConversion; + + $forInclusion = $flags & Parser::PTD_FOR_INCLUSION; + + $xmlishElements = $this->parser->getStripList(); + $xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ]; + $enableOnlyinclude = false; + if ( $forInclusion ) { + $ignoredTags = [ 'includeonly', '/includeonly' ]; + $ignoredElements = [ 'noinclude' ]; + $xmlishElements[] = 'noinclude'; + if ( strpos( $text, '<onlyinclude>' ) !== false + && strpos( $text, '</onlyinclude>' ) !== false + ) { + $enableOnlyinclude = true; + } + } else { + $ignoredTags = [ 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' ]; + $ignoredElements = [ 'includeonly' ]; + $xmlishElements[] = 'includeonly'; + } + $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) ); + + // Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset + $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA"; + + $stack = new PPDStack; + + $searchBase = "[{<\n"; # } + if ( !$wgDisableLangConversion ) { + $searchBase .= '-'; + } + + // For fast reverse searches + $revText = strrev( $text ); + $lengthText = strlen( $text ); + + // Input pointer, starts out pointing to a pseudo-newline before the start + $i = 0; + // Current accumulator + $accum =& $stack->getAccum(); + $accum = '<root>'; + // True to find equals signs in arguments + $findEquals = false; + // True to take notice of pipe characters + $findPipe = false; + $headingIndex = 1; + // True if $i is inside a possible heading + $inHeading = false; + // True if there are no more greater-than (>) signs right of $i + $noMoreGT = false; + // Map of tag name => true if there are no more closing tags of given type right of $i + $noMoreClosingTag = []; + // True to ignore all input up to the next <onlyinclude> + 
$findOnlyinclude = $enableOnlyinclude; + // Do a line-start run without outputting an LF character + $fakeLineStart = true; + + while ( true ) { + // $this->memCheck(); + + if ( $findOnlyinclude ) { + // Ignore all input up to the next <onlyinclude> + $startPos = strpos( $text, '<onlyinclude>', $i ); + if ( $startPos === false ) { + // Ignored section runs to the end + $accum .= '<ignore>' . htmlspecialchars( substr( $text, $i ) ) . '</ignore>'; + break; + } + $tagEndPos = $startPos + strlen( '<onlyinclude>' ); // past-the-end + $accum .= '<ignore>' . htmlspecialchars( substr( $text, $i, $tagEndPos - $i ) ) . '</ignore>'; + $i = $tagEndPos; + $findOnlyinclude = false; + } + + if ( $fakeLineStart ) { + $found = 'line-start'; + $curChar = ''; + } else { + # Find next opening brace, closing brace or pipe + $search = $searchBase; + if ( $stack->top === false ) { + $currentClosing = ''; + } else { + $currentClosing = $stack->top->close; + $search .= $currentClosing; + } + if ( $findPipe ) { + $search .= '|'; + } + if ( $findEquals ) { + // First equals will be for the template + $search .= '='; + } + $rule = null; + # Output literal section, advance input counter + $literalLength = strcspn( $text, $search, $i ); + if ( $literalLength > 0 ) { + $accum .= htmlspecialchars( substr( $text, $i, $literalLength ) ); + $i += $literalLength; + } + if ( $i >= $lengthText ) { + if ( $currentClosing == "\n" ) { + // Do a past-the-end run to finish off the heading + $curChar = ''; + $found = 'line-end'; + } else { + # All done + break; + } + } else { + $curChar = $curTwoChar = $text[$i]; + if ( ( $i + 1 ) < $lengthText ) { + $curTwoChar .= $text[$i + 1]; + } + if ( $curChar == '|' ) { + $found = 'pipe'; + } elseif ( $curChar == '=' ) { + $found = 'equals'; + } elseif ( $curChar == '<' ) { + $found = 'angle'; + } elseif ( $curChar == "\n" ) { + if ( $inHeading ) { + $found = 'line-end'; + } else { + $found = 'line-start'; + } + } elseif ( $curTwoChar == $currentClosing ) { + $found = 
'close'; + $curChar = $curTwoChar; + } elseif ( $curChar == $currentClosing ) { + $found = 'close'; + } elseif ( isset( $this->rules[$curTwoChar] ) ) { + $curChar = $curTwoChar; + $found = 'open'; + $rule = $this->rules[$curChar]; + } elseif ( isset( $this->rules[$curChar] ) ) { + $found = 'open'; + $rule = $this->rules[$curChar]; + } else { + # Some versions of PHP have a strcspn which stops on + # null characters; ignore these and continue. + # We also may get '-' and '}' characters here which + # don't match -{ or $currentClosing. Add these to + # output and continue. + if ( $curChar == '-' || $curChar == '}' ) { + $accum .= $curChar; + } + ++$i; + continue; + } + } + } + + if ( $found == 'angle' ) { + $matches = false; + // Handle </onlyinclude> + if ( $enableOnlyinclude + && substr( $text, $i, strlen( '</onlyinclude>' ) ) == '</onlyinclude>' + ) { + $findOnlyinclude = true; + continue; + } + + // Determine element name + if ( !preg_match( $elementsRegex, $text, $matches, 0, $i + 1 ) ) { + // Element name missing or not listed + $accum .= '<'; + ++$i; + continue; + } + // Handle comments + if ( isset( $matches[2] ) && $matches[2] == '!--' ) { + // To avoid leaving blank lines, when a sequence of + // space-separated comments is both preceded and followed by + // a newline (ignoring spaces), then + // trim leading and trailing spaces and the trailing newline. + + // Find the end + $endPos = strpos( $text, '-->', $i + 4 ); + if ( $endPos === false ) { + // Unclosed comment in input, runs to end + $inner = substr( $text, $i ); + $accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>'; + $i = $lengthText; + } else { + // Search backwards for leading whitespace + $wsStart = $i ? 
( $i - strspn( $revText, " \t", $lengthText - $i ) ) : 0; + + // Search forwards for trailing whitespace + // $wsEnd will be the position of the last space (or the '>' if there's none) + $wsEnd = $endPos + 2 + strspn( $text, " \t", $endPos + 3 ); + + // Keep looking forward as long as we're finding more + // comments. + $comments = [ [ $wsStart, $wsEnd ] ]; + while ( substr( $text, $wsEnd + 1, 4 ) == '<!--' ) { + $c = strpos( $text, '-->', $wsEnd + 4 ); + if ( $c === false ) { + break; + } + $c = $c + 2 + strspn( $text, " \t", $c + 3 ); + $comments[] = [ $wsEnd + 1, $c ]; + $wsEnd = $c; + } + + // Eat the line if possible + // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at + // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but + // it's a possible beneficial b/c break. + if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n" + && substr( $text, $wsEnd + 1, 1 ) == "\n" + ) { + // Remove leading whitespace from the end of the accumulator + // Sanity check first though + $wsLength = $i - $wsStart; + if ( $wsLength > 0 + && strspn( $accum, " \t", -$wsLength ) === $wsLength + ) { + $accum = substr( $accum, 0, -$wsLength ); + } + + // Dump all but the last comment to the accumulator + foreach ( $comments as $j => $com ) { + $startPos = $com[0]; + $endPos = $com[1] + 1; + if ( $j == ( count( $comments ) - 1 ) ) { + break; + } + $inner = substr( $text, $startPos, $endPos - $startPos ); + $accum .= '<comment>' . htmlspecialchars( $inner ) . 
'</comment>'; + } + + // Do a line-start run next time to look for headings after the comment + $fakeLineStart = true; + } else { + // No line to eat, just take the comment itself + $startPos = $i; + $endPos += 2; + } + + if ( $stack->top ) { + $part = $stack->top->getCurrentPart(); + if ( !( isset( $part->commentEnd ) && $part->commentEnd == $wsStart - 1 ) ) { + $part->visualEnd = $wsStart; + } + // Else comments abutting, no change in visual end + $part->commentEnd = $endPos; + } + $i = $endPos + 1; + $inner = substr( $text, $startPos, $endPos - $startPos + 1 ); + $accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>'; + } + continue; + } + $name = $matches[1]; + $lowerName = strtolower( $name ); + $attrStart = $i + strlen( $name ) + 1; + + // Find end of tag + $tagEndPos = $noMoreGT ? false : strpos( $text, '>', $attrStart ); + if ( $tagEndPos === false ) { + // Infinite backtrack + // Disable tag search to prevent worst-case O(N^2) performance + $noMoreGT = true; + $accum .= '<'; + ++$i; + continue; + } + + // Handle ignored tags + if ( in_array( $lowerName, $ignoredTags ) ) { + $accum .= '<ignore>' + . htmlspecialchars( substr( $text, $i, $tagEndPos - $i + 1 ) ) + . '</ignore>'; + $i = $tagEndPos + 1; + continue; + } + + $tagStartPos = $i; + if ( $text[$tagEndPos - 1] == '/' ) { + $attrEnd = $tagEndPos - 1; + $inner = null; + $i = $tagEndPos + 1; + $close = ''; + } else { + $attrEnd = $tagEndPos; + // Find closing tag + if ( + !isset( $noMoreClosingTag[$name] ) && + preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i", + $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 ) + ) { + $inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 ); + $i = $matches[0][1] + strlen( $matches[0][0] ); + $close = '<close>' . htmlspecialchars( $matches[0][0] ) . '</close>'; + } else { + // No end tag + if ( in_array( $name, $xmlishAllowMissingEndTag ) ) { + // Let it run out to the end of the text. 
+ $inner = substr( $text, $tagEndPos + 1 ); + $i = $lengthText; + $close = ''; + } else { + // Don't match the tag, treat opening tag as literal and resume parsing. + $i = $tagEndPos + 1; + $accum .= htmlspecialchars( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) ); + // Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>... + $noMoreClosingTag[$name] = true; + continue; + } + } + } + // <includeonly> and <noinclude> just become <ignore> tags + if ( in_array( $lowerName, $ignoredElements ) ) { + $accum .= '<ignore>' . htmlspecialchars( substr( $text, $tagStartPos, $i - $tagStartPos ) ) + . '</ignore>'; + continue; + } + + $accum .= '<ext>'; + if ( $attrEnd <= $attrStart ) { + $attr = ''; + } else { + $attr = substr( $text, $attrStart, $attrEnd - $attrStart ); + } + $accum .= '<name>' . htmlspecialchars( $name ) . '</name>' . + // Note that the attr element contains the whitespace between name and attribute, + // this is necessary for precise reconstruction during pre-save transform. + '<attr>' . htmlspecialchars( $attr ) . '</attr>'; + if ( $inner !== null ) { + $accum .= '<inner>' . htmlspecialchars( $inner ) . '</inner>'; + } + $accum .= $close . '</ext>'; + } elseif ( $found == 'line-start' ) { + // Is this the start of a heading? + // Line break belongs before the heading element in any case + if ( $fakeLineStart ) { + $fakeLineStart = false; + } else { + $accum .= $curChar; + $i++; + } + + $count = strspn( $text, '=', $i, 6 ); + if ( $count == 1 && $findEquals ) { + // DWIM: This looks kind of like a name/value separator. + // Let's let the equals handler have it and break the + // potential heading. This is heuristic, but AFAICT the + // methods for completely correct disambiguation are very + // complex. 
+ } elseif ( $count > 0 ) { + $piece = [ + 'open' => "\n", + 'close' => "\n", + 'parts' => [ new PPDPart( str_repeat( '=', $count ) ) ], + 'startPos' => $i, + 'count' => $count ]; + $stack->push( $piece ); + $accum =& $stack->getAccum(); + $stackFlags = $stack->getFlags(); + if ( isset( $stackFlags['findEquals'] ) ) { + $findEquals = $stackFlags['findEquals']; + } + if ( isset( $stackFlags['findPipe'] ) ) { + $findPipe = $stackFlags['findPipe']; + } + if ( isset( $stackFlags['inHeading'] ) ) { + $inHeading = $stackFlags['inHeading']; + } + $i += $count; + } + } elseif ( $found == 'line-end' ) { + $piece = $stack->top; + // A heading must be open, otherwise \n wouldn't have been in the search list + assert( $piece->open === "\n" ); + $part = $piece->getCurrentPart(); + // Search back through the input to see if it has a proper close. + // Do this using the reversed string since the other solutions + // (end anchor, etc.) are inefficient. + $wsLength = strspn( $revText, " \t", $lengthText - $i ); + $searchStart = $i - $wsLength; + if ( isset( $part->commentEnd ) && $searchStart - 1 == $part->commentEnd ) { + // Comment found at line end + // Search for equals signs before the comment + $searchStart = $part->visualEnd; + $searchStart -= strspn( $revText, " \t", $lengthText - $searchStart ); + } + $count = $piece->count; + $equalsLength = strspn( $revText, '=', $lengthText - $searchStart ); + if ( $equalsLength > 0 ) { + if ( $searchStart - $equalsLength == $piece->startPos ) { + // This is just a single string of equals signs on its own line + // Replicate the doHeadings behavior /={count}(.+)={count}/ + // First find out how many equals signs there really are (don't stop at 6) + $count = $equalsLength; + if ( $count < 3 ) { + $count = 0; + } else { + $count = min( 6, intval( ( $count - 1 ) / 2 ) ); + } + } else { + $count = min( $equalsLength, $count ); + } + if ( $count > 0 ) { + // Normal match, output <h> + $element = "<h level=\"$count\" 
i=\"$headingIndex\">$accum</h>"; + $headingIndex++; + } else { + // Single equals sign on its own line, count=0 + $element = $accum; + } + } else { + // No match, no <h>, just pass down the inner text + $element = $accum; + } + // Unwind the stack + $stack->pop(); + $accum =& $stack->getAccum(); + $stackFlags = $stack->getFlags(); + if ( isset( $stackFlags['findEquals'] ) ) { + $findEquals = $stackFlags['findEquals']; + } + if ( isset( $stackFlags['findPipe'] ) ) { + $findPipe = $stackFlags['findPipe']; + } + if ( isset( $stackFlags['inHeading'] ) ) { + $inHeading = $stackFlags['inHeading']; + } + + // Append the result to the enclosing accumulator + $accum .= $element; + // Note that we do NOT increment the input pointer. + // This is because the closing linebreak could be the opening linebreak of + // another heading. Infinite loops are avoided because the next iteration MUST + // hit the heading open case above, which unconditionally increments the + // input pointer. + } elseif ( $found == 'open' ) { + # count opening brace characters + $curLen = strlen( $curChar ); + $count = ( $curLen > 1 ) ? 
+ # allow the final character to repeat + strspn( $text, $curChar[$curLen - 1], $i + 1 ) + 1 : + strspn( $text, $curChar, $i ); + + $savedPrefix = ''; + $lineStart = ( $i > 0 && $text[$i - 1] == "\n" ); + + if ( $curChar === "-{" && $count > $curLen ) { + // -{ => {{ transition because rightmost wins + $savedPrefix = '-'; + $i++; + $curChar = '{'; + $count--; + $rule = $this->rules[$curChar]; + } + + # we need to add to stack only if opening brace count is enough for one of the rules + if ( $count >= $rule['min'] ) { + # Add it to the stack + $piece = [ + 'open' => $curChar, + 'close' => $rule['end'], + 'savedPrefix' => $savedPrefix, + 'count' => $count, + 'lineStart' => $lineStart, + ]; + + $stack->push( $piece ); + $accum =& $stack->getAccum(); + $stackFlags = $stack->getFlags(); + if ( isset( $stackFlags['findEquals'] ) ) { + $findEquals = $stackFlags['findEquals']; + } + if ( isset( $stackFlags['findPipe'] ) ) { + $findPipe = $stackFlags['findPipe']; + } + if ( isset( $stackFlags['inHeading'] ) ) { + $inHeading = $stackFlags['inHeading']; + } + } else { + # Add literal brace(s) + $accum .= htmlspecialchars( $savedPrefix . str_repeat( $curChar, $count ) ); + } + $i += $count; + } elseif ( $found == 'close' ) { + $piece = $stack->top; + # lets check if there are enough characters for closing brace + $maxCount = $piece->count; + if ( $piece->close === '}-' && $curChar === '}' ) { + $maxCount--; # don't try to match closing '-' as a '}' + } + $curLen = strlen( $curChar ); + $count = ( $curLen > 1 ) ? 
$curLen : + strspn( $text, $curChar, $i, $maxCount ); + + # check for maximum matching characters (if there are 5 closing + # characters, we will probably need only 3 - depending on the rules) + $rule = $this->rules[$piece->open]; + if ( $count > $rule['max'] ) { + # The specified maximum exists in the callback array, unless the caller + # has made an error + $matchingCount = $rule['max']; + } else { + # Count is less than the maximum + # Skip any gaps in the callback array to find the true largest match + # Need to use array_key_exists not isset because the callback can be null + $matchingCount = $count; + while ( $matchingCount > 0 && !array_key_exists( $matchingCount, $rule['names'] ) ) { + --$matchingCount; + } + } + + if ( $matchingCount <= 0 ) { + # No matching element found in callback array + # Output a literal closing brace and continue + $endText = substr( $text, $i, $count ); + $accum .= htmlspecialchars( $endText ); + $i += $count; + continue; + } + $name = $rule['names'][$matchingCount]; + if ( $name === null ) { + // No element, just literal text + $endText = substr( $text, $i, $matchingCount ); + $element = $piece->breakSyntax( $matchingCount ) . $endText; + } else { + # Create XML element + # Note: $parts is already XML, does not need to be encoded further + $parts = $piece->parts; + $title = $parts[0]->out; + unset( $parts[0] ); + + # The invocation is at the start of the line if lineStart is set in + # the stack, and all opening brackets are used up. 
+ if ( $maxCount == $matchingCount && + !empty( $piece->lineStart ) && + strlen( $piece->savedPrefix ) == 0 ) { + $attr = ' lineStart="1"'; + } else { + $attr = ''; + } + + $element = "<$name$attr>"; + $element .= "<title>$title</title>"; + $argIndex = 1; + foreach ( $parts as $part ) { + if ( isset( $part->eqpos ) ) { + $argName = substr( $part->out, 0, $part->eqpos ); + $argValue = substr( $part->out, $part->eqpos + 1 ); + $element .= "<part><name>$argName</name>=<value>$argValue</value></part>"; + } else { + $element .= "<part><name index=\"$argIndex\" /><value>{$part->out}</value></part>"; + $argIndex++; + } + } + $element .= "</$name>"; + } + + # Advance input pointer + $i += $matchingCount; + + # Unwind the stack + $stack->pop(); + $accum =& $stack->getAccum(); + + # Re-add the old stack element if it still has unmatched opening characters remaining + if ( $matchingCount < $piece->count ) { + $piece->parts = [ new PPDPart ]; + $piece->count -= $matchingCount; + # do we still qualify for any callback with remaining count? + $min = $this->rules[$piece->open]['min']; + if ( $piece->count >= $min ) { + $stack->push( $piece ); + $accum =& $stack->getAccum(); + } elseif ( $piece->count == 1 && $piece->open === '{' && $piece->savedPrefix === '-' ) { + $piece->savedPrefix = ''; + $piece->open = '-{'; + $piece->count = 2; + $piece->close = $this->rules[$piece->open]['end']; + $stack->push( $piece ); + $accum =& $stack->getAccum(); + } else { + $s = substr( $piece->open, 0, -1 ); + $s .= str_repeat( + substr( $piece->open, -1 ), + $piece->count - strlen( $s ) + ); + $accum .= $piece->savedPrefix . 
$s; + } + } elseif ( $piece->savedPrefix !== '' ) { + $accum .= $piece->savedPrefix; + } + + $stackFlags = $stack->getFlags(); + if ( isset( $stackFlags['findEquals'] ) ) { + $findEquals = $stackFlags['findEquals']; + } + if ( isset( $stackFlags['findPipe'] ) ) { + $findPipe = $stackFlags['findPipe']; + } + if ( isset( $stackFlags['inHeading'] ) ) { + $inHeading = $stackFlags['inHeading']; + } + + # Add XML element to the enclosing accumulator + $accum .= $element; + } elseif ( $found == 'pipe' ) { + $findEquals = true; // shortcut for getFlags() + $stack->addPart(); + $accum =& $stack->getAccum(); + ++$i; + } elseif ( $found == 'equals' ) { + $findEquals = false; // shortcut for getFlags() + $stack->getCurrentPart()->eqpos = strlen( $accum ); + $accum .= '='; + ++$i; + } + } + + # Output any remaining unclosed brackets + foreach ( $stack->stack as $piece ) { + $stack->rootAccum .= $piece->breakSyntax(); + } + $stack->rootAccum .= '</root>'; + $xml = $stack->rootAccum; + + return $xml; + } +} + +/** + * Stack class to help Preprocessor::preprocessToObj() + * @ingroup Parser + */ +class PPDStack { + public $stack, $rootAccum; + + /** + * @var PPDStackElement|bool Element on top of the stack, or false when the stack is empty + */ + public $top; + public $out; + public $elementClass = PPDStackElement::class; + + /** + * @var string Reference to the current accumulator: the output of the top element's + * current part, or $rootAccum when the stack is empty. Declared explicitly so that + * it is not created as a dynamic property. + */ + public $accum; + + public static $false = false; + + /** + * Start with an empty stack whose accumulator is the root accumulator. + */ + public function __construct() { + $this->stack = []; + $this->top = false; + $this->rootAccum = ''; + $this->accum =& $this->rootAccum; + } + + /** + * @return int Number of elements currently on the stack + */ + public function count() { + return count( $this->stack ); + } + + /** + * @return string Reference to the current accumulator + */ + public function &getAccum() { + return $this->accum; + } + + /** + * @return PPDPart|bool Current part of the top element, or false when the stack is empty + */ + public function getCurrentPart() { + if ( $this->top === false ) { + return false; + } else { + return $this->top->getCurrentPart(); + } + } + + /** + * Push an element and make its accumulator the current one. + * + * @param PPDStackElement|array $data An element, or data from which one is constructed + */ + public function push( $data ) { + if ( $data instanceof $this->elementClass ) { + $this->stack[] = $data; + } else { + $class = $this->elementClass; + $this->stack[] = new $class( $data ); + } + $this->top = $this->stack[count( $this->stack ) - 1]; + $this->accum
=& $this->top->getAccum(); + } + + /** + * Remove the top element and fall back to the accumulator of the element below it, + * or to the root accumulator when the stack becomes empty. + * + * @throws MWException When the stack is already empty + * @return PPDStackElement The element that was removed + */ + public function pop() { + if ( !count( $this->stack ) ) { + throw new MWException( __METHOD__ . ': no elements remaining' ); + } + $temp = array_pop( $this->stack ); + + if ( count( $this->stack ) ) { + $this->top = $this->stack[count( $this->stack ) - 1]; + $this->accum =& $this->top->getAccum(); + } else { + $this->top = self::$false; + $this->accum =& $this->rootAccum; + } + return $temp; + } + + /** + * Start a new part on the top element and switch the accumulator to it. + * + * @param string $s Initial output of the new part + */ + public function addPart( $s = '' ) { + $this->top->addPart( $s ); + $this->accum =& $this->top->getAccum(); + } + + /** + * @return array Flags of the top element, or all-false flags when the stack is empty + */ + public function getFlags() { + if ( !count( $this->stack ) ) { + return [ + 'findEquals' => false, + 'findPipe' => false, + 'inHeading' => false, + ]; + } else { + return $this->top->getFlags(); + } + } +} + +/** + * @ingroup Parser + */ +class PPDStackElement { + /** + * @var string Opening character (\n for heading) + */ + public $open; + + /** + * @var string Matching closing character + */ + public $close; + + /** + * @var string Saved prefix that may affect later processing, + * e.g. to differentiate `-{{{{` and `{{{{` after later seeing `}}}`. + */ + public $savedPrefix = ''; + + /** + * @var int Number of opening characters found (number of "=" for heading) + */ + public $count; + + /** + * @var PPDPart[] Array of PPDPart objects describing pipe-separated parts. + */ + public $parts; + + /** + * @var int|null Input offset at which a heading's run of "=" begins, as supplied by the + * preprocessor when it opens a heading. Not set for other elements. + */ + public $startPos; + + /** + * @var bool True if the open char appeared at the start of the input line. + * Not set for headings. 
+ */ + public $lineStart; + + public $partClass = PPDPart::class; + + public function __construct( $data = [] ) { + $class = $this->partClass; + $this->parts = [ new $class ]; + + foreach ( $data as $name => $value ) { + $this->$name = $value; + } + } + + public function &getAccum() { + return $this->parts[count( $this->parts ) - 1]->out; + } + + public function addPart( $s = '' ) { + $class = $this->partClass; + $this->parts[] = new $class( $s ); + } + + public function getCurrentPart() { + return $this->parts[count( $this->parts ) - 1]; + } + + /** + * @return array + */ + public function getFlags() { + $partCount = count( $this->parts ); + $findPipe = $this->open != "\n" && $this->open != '['; + return [ + 'findPipe' => $findPipe, + 'findEquals' => $findPipe && $partCount > 1 && !isset( $this->parts[$partCount - 1]->eqpos ), + 'inHeading' => $this->open == "\n", + ]; + } + + /** + * Get the output string that would result if the close is not found. + * + * @param bool|int $openingCount + * @return string + */ + public function breakSyntax( $openingCount = false ) { + if ( $this->open == "\n" ) { + $s = $this->savedPrefix . $this->parts[0]->out; + } else { + if ( $openingCount === false ) { + $openingCount = $this->count; + } + $s = substr( $this->open, 0, -1 ); + $s .= str_repeat( + substr( $this->open, -1 ), + $openingCount - strlen( $s ) + ); + $s = $this->savedPrefix . 
$s; + $first = true; + foreach ( $this->parts as $part ) { + if ( $first ) { + $first = false; + } else { + $s .= '|'; + } + $s .= $part->out; + } + } + return $s; + } +} + +/** + * One part of a stack element, holding the output accumulated for it. + * + * @ingroup Parser + */ +class PPDPart { + /** + * @var string Output accumulator string + */ + public $out; + + /** + * @var int|null Position of equals sign in output accumulator; null until one is recorded + */ + public $eqpos; + + /** + * @var int|null Input offset of the last character consumed with the most recent + * comment; null until a comment is recorded + */ + public $commentEnd; + + /** + * @var int|null Past-the-end input pointer for the end of the accumulator minus comments; + * null until a comment is recorded + */ + public $visualEnd; + + /** + * @param string $out Initial contents of the output accumulator + */ + public function __construct( $out = '' ) { + $this->out = $out; + } +} + +/** + * An expansion frame, used as a context to expand the result of preprocessToObj() + * @ingroup Parser + */ +// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps +class PPFrame_DOM implements PPFrame { + + /** + * @var Preprocessor + */ + public $preprocessor; + + /** + * @var Parser + */ + public $parser; + + /** + * @var Title + */ + public $title; + public $titleCache; + + /** + * Hashtable listing templates which are disallowed for expansion in this frame, + * having been encountered previously in parent frames. + */ + public $loopCheckHash; + + /** + * Recursion depth of this frame, top = 0 + * Note that this is NOT the same as expansion depth in expand() + */ + public $depth; + + private $volatile = false; + private $ttl = null; + + /** + * @var array + */ + protected $childExpansionCache; + + /** + * Construct a new preprocessor frame. + * @param Preprocessor $preprocessor The parent preprocessor + */ + public function __construct( $preprocessor ) { + $this->preprocessor = $preprocessor; + $this->parser = $preprocessor->parser; + $this->title = $this->parser->mTitle; + $this->titleCache = [ $this->title ? 
$this->title->getPrefixedDBkey() : false ]; + $this->loopCheckHash = []; + $this->depth = 0; + $this->childExpansionCache = []; + } + + /** + * Create a new child frame + * $args is optionally a multi-root PPNode or array containing the template arguments + * + * @param bool|array $args + * @param Title|bool $title + * @param int $indexOffset + * @return PPTemplateFrame_DOM + */ + public function newChild( $args = false, $title = false, $indexOffset = 0 ) { + $namedArgs = []; + $numberedArgs = []; + if ( $title === false ) { + $title = $this->title; + } + if ( $args !== false ) { + $xpath = false; + if ( $args instanceof PPNode ) { + $args = $args->node; + } + foreach ( $args as $arg ) { + if ( $arg instanceof PPNode ) { + $arg = $arg->node; + } + if ( !$xpath || $xpath->document !== $arg->ownerDocument ) { + $xpath = new DOMXPath( $arg->ownerDocument ); + } + + $nameNodes = $xpath->query( 'name', $arg ); + $value = $xpath->query( 'value', $arg ); + if ( $nameNodes->item( 0 )->hasAttributes() ) { + // Numbered parameter + $index = $nameNodes->item( 0 )->attributes->getNamedItem( 'index' )->textContent; + $index = $index - $indexOffset; + if ( isset( $namedArgs[$index] ) || isset( $numberedArgs[$index] ) ) { + $this->parser->getOutput()->addWarning( wfMessage( 'duplicate-args-warning', + wfEscapeWikiText( $this->title ), + wfEscapeWikiText( $title ), + wfEscapeWikiText( $index ) )->text() ); + $this->parser->addTrackingCategory( 'duplicate-args-category' ); + } + $numberedArgs[$index] = $value->item( 0 ); + unset( $namedArgs[$index] ); + } else { + // Named parameter + $name = trim( $this->expand( $nameNodes->item( 0 ), PPFrame::STRIP_COMMENTS ) ); + if ( isset( $namedArgs[$name] ) || isset( $numberedArgs[$name] ) ) { + $this->parser->getOutput()->addWarning( wfMessage( 'duplicate-args-warning', + wfEscapeWikiText( $this->title ), + wfEscapeWikiText( $title ), + wfEscapeWikiText( $name ) )->text() ); + $this->parser->addTrackingCategory( 'duplicate-args-category' ); 
+ } + $namedArgs[$name] = $value->item( 0 ); + unset( $numberedArgs[$name] ); + } + } + } + return new PPTemplateFrame_DOM( $this->preprocessor, $this, $numberedArgs, $namedArgs, $title ); + } + + /** + * @throws MWException + * @param string|int $key + * @param string|PPNode_DOM|DOMDocument $root + * @param int $flags + * @return string + */ + public function cachedExpand( $key, $root, $flags = 0 ) { + // we don't have a parent, so we don't have a cache + return $this->expand( $root, $flags ); + } + + /** + * @throws MWException + * @param string|PPNode_DOM|DOMDocument $root + * @param int $flags + * @return string + */ + public function expand( $root, $flags = 0 ) { + static $expansionDepth = 0; + if ( is_string( $root ) ) { + return $root; + } + + if ( ++$this->parser->mPPNodeCount > $this->parser->mOptions->getMaxPPNodeCount() ) { + $this->parser->limitationWarn( 'node-count-exceeded', + $this->parser->mPPNodeCount, + $this->parser->mOptions->getMaxPPNodeCount() + ); + return '<span class="error">Node-count limit exceeded</span>'; + } + + if ( $expansionDepth > $this->parser->mOptions->getMaxPPExpandDepth() ) { + $this->parser->limitationWarn( 'expansion-depth-exceeded', + $expansionDepth, + $this->parser->mOptions->getMaxPPExpandDepth() + ); + return '<span class="error">Expansion depth limit exceeded</span>'; + } + ++$expansionDepth; + if ( $expansionDepth > $this->parser->mHighestExpansionDepth ) { + $this->parser->mHighestExpansionDepth = $expansionDepth; + } + + if ( $root instanceof PPNode_DOM ) { + $root = $root->node; + } + if ( $root instanceof DOMDocument ) { + $root = $root->documentElement; + } + + $outStack = [ '', '' ]; + $iteratorStack = [ false, $root ]; + $indexStack = [ 0, 0 ]; + + while ( count( $iteratorStack ) > 1 ) { + $level = count( $outStack ) - 1; + $iteratorNode =& $iteratorStack[$level]; + $out =& $outStack[$level]; + $index =& $indexStack[$level]; + + if ( $iteratorNode instanceof PPNode_DOM ) { + $iteratorNode = 
$iteratorNode->node; + } + + if ( is_array( $iteratorNode ) ) { + if ( $index >= count( $iteratorNode ) ) { + // All done with this iterator + $iteratorStack[$level] = false; + $contextNode = false; + } else { + $contextNode = $iteratorNode[$index]; + $index++; + } + } elseif ( $iteratorNode instanceof DOMNodeList ) { + if ( $index >= $iteratorNode->length ) { + // All done with this iterator + $iteratorStack[$level] = false; + $contextNode = false; + } else { + $contextNode = $iteratorNode->item( $index ); + $index++; + } + } else { + // Copy to $contextNode and then delete from iterator stack, + // because this is not an iterator but we do have to execute it once + $contextNode = $iteratorStack[$level]; + $iteratorStack[$level] = false; + } + + if ( $contextNode instanceof PPNode_DOM ) { + $contextNode = $contextNode->node; + } + + $newIterator = false; + + if ( $contextNode === false ) { + // nothing to do + } elseif ( is_string( $contextNode ) ) { + $out .= $contextNode; + } elseif ( is_array( $contextNode ) || $contextNode instanceof DOMNodeList ) { + $newIterator = $contextNode; + } elseif ( $contextNode instanceof DOMNode ) { + if ( $contextNode->nodeType == XML_TEXT_NODE ) { + $out .= $contextNode->nodeValue; + } elseif ( $contextNode->nodeName == 'template' ) { + # Double-brace expansion + $xpath = new DOMXPath( $contextNode->ownerDocument ); + $titles = $xpath->query( 'title', $contextNode ); + $title = $titles->item( 0 ); + $parts = $xpath->query( 'part', $contextNode ); + if ( $flags & PPFrame::NO_TEMPLATES ) { + $newIterator = $this->virtualBracketedImplode( '{{', '|', '}}', $title, $parts ); + } else { + $lineStart = $contextNode->getAttribute( 'lineStart' ); + $params = [ + 'title' => new PPNode_DOM( $title ), + 'parts' => new PPNode_DOM( $parts ), + 'lineStart' => $lineStart ]; + $ret = $this->parser->braceSubstitution( $params, $this ); + if ( isset( $ret['object'] ) ) { + $newIterator = $ret['object']; + } else { + $out .= $ret['text']; + } + } + 
} elseif ( $contextNode->nodeName == 'tplarg' ) { + # Triple-brace expansion + $xpath = new DOMXPath( $contextNode->ownerDocument ); + $titles = $xpath->query( 'title', $contextNode ); + $title = $titles->item( 0 ); + $parts = $xpath->query( 'part', $contextNode ); + if ( $flags & PPFrame::NO_ARGS ) { + $newIterator = $this->virtualBracketedImplode( '{{{', '|', '}}}', $title, $parts ); + } else { + $params = [ + 'title' => new PPNode_DOM( $title ), + 'parts' => new PPNode_DOM( $parts ) ]; + $ret = $this->parser->argSubstitution( $params, $this ); + if ( isset( $ret['object'] ) ) { + $newIterator = $ret['object']; + } else { + $out .= $ret['text']; + } + } + } elseif ( $contextNode->nodeName == 'comment' ) { + # HTML-style comment + # Remove it in HTML, pre+remove and STRIP_COMMENTS modes + # Not in RECOVER_COMMENTS mode (msgnw) though. + if ( ( $this->parser->ot['html'] + || ( $this->parser->ot['pre'] && $this->parser->mOptions->getRemoveComments() ) + || ( $flags & PPFrame::STRIP_COMMENTS ) + ) && !( $flags & PPFrame::RECOVER_COMMENTS ) + ) { + $out .= ''; + } elseif ( $this->parser->ot['wiki'] && !( $flags & PPFrame::RECOVER_COMMENTS ) ) { + # Add a strip marker in PST mode so that pstPass2() can + # run some old-fashioned regexes on the result. + # Not in RECOVER_COMMENTS mode (extractSections) though. + $out .= $this->parser->insertStripItem( $contextNode->textContent ); + } else { + # Recover the literal comment in RECOVER_COMMENTS and pre+no-remove + $out .= $contextNode->textContent; + } + } elseif ( $contextNode->nodeName == 'ignore' ) { + # Output suppression used by <includeonly> etc. + # OT_WIKI will only respect <ignore> in substed templates. + # The other output types respect it unless NO_IGNORE is set. + # extractSections() sets NO_IGNORE and so never respects it. 
+ if ( ( !isset( $this->parent ) && $this->parser->ot['wiki'] ) + || ( $flags & PPFrame::NO_IGNORE ) + ) { + $out .= $contextNode->textContent; + } else { + $out .= ''; + } + } elseif ( $contextNode->nodeName == 'ext' ) { + # Extension tag + $xpath = new DOMXPath( $contextNode->ownerDocument ); + $names = $xpath->query( 'name', $contextNode ); + $attrs = $xpath->query( 'attr', $contextNode ); + $inners = $xpath->query( 'inner', $contextNode ); + $closes = $xpath->query( 'close', $contextNode ); + if ( $flags & PPFrame::NO_TAGS ) { + $s = '<' . $this->expand( $names->item( 0 ), $flags ); + if ( $attrs->length > 0 ) { + $s .= $this->expand( $attrs->item( 0 ), $flags ); + } + if ( $inners->length > 0 ) { + $s .= '>' . $this->expand( $inners->item( 0 ), $flags ); + if ( $closes->length > 0 ) { + $s .= $this->expand( $closes->item( 0 ), $flags ); + } + } else { + $s .= '/>'; + } + $out .= $s; + } else { + $params = [ + 'name' => new PPNode_DOM( $names->item( 0 ) ), + 'attr' => $attrs->length > 0 ? new PPNode_DOM( $attrs->item( 0 ) ) : null, + 'inner' => $inners->length > 0 ? new PPNode_DOM( $inners->item( 0 ) ) : null, + 'close' => $closes->length > 0 ? new PPNode_DOM( $closes->item( 0 ) ) : null, + ]; + $out .= $this->parser->extensionSubstitution( $params, $this ); + } + } elseif ( $contextNode->nodeName == 'h' ) { + # Heading + $s = $this->expand( $contextNode->childNodes, $flags ); + + # Insert a heading marker only for <h> children of <root> + # This is to stop extractSections from going over multiple tree levels + if ( $contextNode->parentNode->nodeName == 'root' && $this->parser->ot['html'] ) { + # Insert heading index marker + $headingIndex = $contextNode->getAttribute( 'i' ); + $titleText = $this->title->getPrefixedDBkey(); + $this->parser->mHeadings[] = [ $titleText, $headingIndex ]; + $serial = count( $this->parser->mHeadings ) - 1; + $marker = Parser::MARKER_PREFIX . "-h-$serial-" . 
Parser::MARKER_SUFFIX; + $count = $contextNode->getAttribute( 'level' ); + $s = substr( $s, 0, $count ) . $marker . substr( $s, $count ); + $this->parser->mStripState->addGeneral( $marker, '' ); + } + $out .= $s; + } else { + # Generic recursive expansion + $newIterator = $contextNode->childNodes; + } + } else { + throw new MWException( __METHOD__ . ': Invalid parameter type' ); + } + + if ( $newIterator !== false ) { + if ( $newIterator instanceof PPNode_DOM ) { + $newIterator = $newIterator->node; + } + $outStack[] = ''; + $iteratorStack[] = $newIterator; + $indexStack[] = 0; + } elseif ( $iteratorStack[$level] === false ) { + // Return accumulated value to parent + // With tail recursion + while ( $iteratorStack[$level] === false && $level > 0 ) { + $outStack[$level - 1] .= $out; + array_pop( $outStack ); + array_pop( $iteratorStack ); + array_pop( $indexStack ); + $level--; + } + } + } + --$expansionDepth; + return $outStack[0]; + } + + /** + * @param string $sep + * @param int $flags + * @param string|PPNode_DOM|DOMDocument $args,... + * @return string + */ + public function implodeWithFlags( $sep, $flags /*, ... */ ) { + $args = array_slice( func_get_args(), 2 ); + + $first = true; + $s = ''; + foreach ( $args as $root ) { + if ( $root instanceof PPNode_DOM ) { + $root = $root->node; + } + if ( !is_array( $root ) && !( $root instanceof DOMNodeList ) ) { + $root = [ $root ]; + } + foreach ( $root as $node ) { + if ( $first ) { + $first = false; + } else { + $s .= $sep; + } + $s .= $this->expand( $node, $flags ); + } + } + return $s; + } + + /** + * Implode with no flags specified + * This previously called implodeWithFlags but has now been inlined to reduce stack depth + * + * @param string $sep + * @param string|PPNode_DOM|DOMDocument $args,... + * @return string + */ + public function implode( $sep /*, ... 
*/ ) { + $args = array_slice( func_get_args(), 1 ); + + $first = true; + $s = ''; + foreach ( $args as $root ) { + if ( $root instanceof PPNode_DOM ) { + $root = $root->node; + } + if ( !is_array( $root ) && !( $root instanceof DOMNodeList ) ) { + $root = [ $root ]; + } + foreach ( $root as $node ) { + if ( $first ) { + $first = false; + } else { + $s .= $sep; + } + $s .= $this->expand( $node ); + } + } + return $s; + } + + /** + * Makes an object that, when expand()ed, will be the same as one obtained + * with implode() + * + * @param string $sep + * @param string|PPNode_DOM|DOMDocument $args,... + * @return array + */ + public function virtualImplode( $sep /*, ... */ ) { + $args = array_slice( func_get_args(), 1 ); + $out = []; + $first = true; + + foreach ( $args as $root ) { + if ( $root instanceof PPNode_DOM ) { + $root = $root->node; + } + if ( !is_array( $root ) && !( $root instanceof DOMNodeList ) ) { + $root = [ $root ]; + } + foreach ( $root as $node ) { + if ( $first ) { + $first = false; + } else { + $out[] = $sep; + } + $out[] = $node; + } + } + return $out; + } + + /** + * Virtual implode with brackets + * @param string $start + * @param string $sep + * @param string $end + * @param string|PPNode_DOM|DOMDocument $args,... + * @return array + */ + public function virtualBracketedImplode( $start, $sep, $end /*, ... 
*/ ) { + $args = array_slice( func_get_args(), 3 ); + $out = [ $start ]; + $first = true; + + foreach ( $args as $root ) { + if ( $root instanceof PPNode_DOM ) { + $root = $root->node; + } + if ( !is_array( $root ) && !( $root instanceof DOMNodeList ) ) { + $root = [ $root ]; + } + foreach ( $root as $node ) { + if ( $first ) { + $first = false; + } else { + $out[] = $sep; + } + $out[] = $node; + } + } + $out[] = $end; + return $out; + } + + public function __toString() { + return 'frame{}'; + } + + public function getPDBK( $level = false ) { + if ( $level === false ) { + return $this->title->getPrefixedDBkey(); + } else { + return isset( $this->titleCache[$level] ) ? $this->titleCache[$level] : false; + } + } + + /** + * @return array + */ + public function getArguments() { + return []; + } + + /** + * @return array + */ + public function getNumberedArguments() { + return []; + } + + /** + * @return array + */ + public function getNamedArguments() { + return []; + } + + /** + * Returns true if there are no arguments in this frame + * + * @return bool + */ + public function isEmpty() { + return true; + } + + /** + * @param int|string $name + * @return bool Always false in this implementation. 
+ */ + public function getArgument( $name ) { + return false; + } + + /** + * Returns true if the infinite loop check is OK, false if a loop is detected + * + * @param Title $title + * @return bool + */ + public function loopCheck( $title ) { + return !isset( $this->loopCheckHash[$title->getPrefixedDBkey()] ); + } + + /** + * Return true if the frame is a template frame + * + * @return bool + */ + public function isTemplate() { + return false; + } + + /** + * Get a title of frame + * + * @return Title + */ + public function getTitle() { + return $this->title; + } + + /** + * Set the volatile flag + * + * @param bool $flag + */ + public function setVolatile( $flag = true ) { + $this->volatile = $flag; + } + + /** + * Get the volatile flag + * + * @return bool + */ + public function isVolatile() { + return $this->volatile; + } + + /** + * Set the TTL + * + * @param int $ttl + */ + public function setTTL( $ttl ) { + if ( $ttl !== null && ( $this->ttl === null || $ttl < $this->ttl ) ) { + $this->ttl = $ttl; + } + } + + /** + * Get the TTL + * + * @return int|null + */ + public function getTTL() { + return $this->ttl; + } +} + +/** + * Expansion frame with template arguments + * @ingroup Parser + */ +// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps +class PPTemplateFrame_DOM extends PPFrame_DOM { + + public $numberedArgs, $namedArgs; + + /** + * @var PPFrame_DOM + */ + public $parent; + public $numberedExpansionCache, $namedExpansionCache; + + /** + * @param Preprocessor $preprocessor + * @param bool|PPFrame_DOM $parent + * @param array $numberedArgs + * @param array $namedArgs + * @param bool|Title $title + */ + public function __construct( $preprocessor, $parent = false, $numberedArgs = [], + $namedArgs = [], $title = false + ) { + parent::__construct( $preprocessor ); + + $this->parent = $parent; + $this->numberedArgs = $numberedArgs; + $this->namedArgs = $namedArgs; + $this->title = $title; + $pdbk = $title ? 
$title->getPrefixedDBkey() : false; + $this->titleCache = $parent->titleCache; + $this->titleCache[] = $pdbk; + $this->loopCheckHash = /*clone*/ $parent->loopCheckHash; + if ( $pdbk !== false ) { + $this->loopCheckHash[$pdbk] = true; + } + $this->depth = $parent->depth + 1; + $this->numberedExpansionCache = $this->namedExpansionCache = []; + } + + public function __toString() { + $s = 'tplframe{'; + $first = true; + $args = $this->numberedArgs + $this->namedArgs; + foreach ( $args as $name => $value ) { + if ( $first ) { + $first = false; + } else { + $s .= ', '; + } + $s .= "\"$name\":\"" . + str_replace( '"', '\\"', $value->ownerDocument->saveXML( $value ) ) . '"'; + } + $s .= '}'; + return $s; + } + + /** + * @throws MWException + * @param string|int $key + * @param string|PPNode_DOM|DOMDocument $root + * @param int $flags + * @return string + */ + public function cachedExpand( $key, $root, $flags = 0 ) { + if ( isset( $this->parent->childExpansionCache[$key] ) ) { + return $this->parent->childExpansionCache[$key]; + } + $retval = $this->expand( $root, $flags ); + if ( !$this->isVolatile() ) { + $this->parent->childExpansionCache[$key] = $retval; + } + return $retval; + } + + /** + * Returns true if there are no arguments in this frame + * + * @return bool + */ + public function isEmpty() { + return !count( $this->numberedArgs ) && !count( $this->namedArgs ); + } + + public function getArguments() { + $arguments = []; + foreach ( array_merge( + array_keys( $this->numberedArgs ), + array_keys( $this->namedArgs ) ) as $key ) { + $arguments[$key] = $this->getArgument( $key ); + } + return $arguments; + } + + public function getNumberedArguments() { + $arguments = []; + foreach ( array_keys( $this->numberedArgs ) as $key ) { + $arguments[$key] = $this->getArgument( $key ); + } + return $arguments; + } + + public function getNamedArguments() { + $arguments = []; + foreach ( array_keys( $this->namedArgs ) as $key ) { + $arguments[$key] = $this->getArgument( $key ); + 
} + return $arguments; + } + + /** + * @param int $index + * @return string|bool + */ + public function getNumberedArgument( $index ) { + if ( !isset( $this->numberedArgs[$index] ) ) { + return false; + } + if ( !isset( $this->numberedExpansionCache[$index] ) ) { + # No trimming for unnamed arguments + $this->numberedExpansionCache[$index] = $this->parent->expand( + $this->numberedArgs[$index], + PPFrame::STRIP_COMMENTS + ); + } + return $this->numberedExpansionCache[$index]; + } + + /** + * @param string $name + * @return string|bool + */ + public function getNamedArgument( $name ) { + if ( !isset( $this->namedArgs[$name] ) ) { + return false; + } + if ( !isset( $this->namedExpansionCache[$name] ) ) { + # Trim named arguments post-expand, for backwards compatibility + $this->namedExpansionCache[$name] = trim( + $this->parent->expand( $this->namedArgs[$name], PPFrame::STRIP_COMMENTS ) ); + } + return $this->namedExpansionCache[$name]; + } + + /** + * @param int|string $name + * @return string|bool + */ + public function getArgument( $name ) { + $text = $this->getNumberedArgument( $name ); + if ( $text === false ) { + $text = $this->getNamedArgument( $name ); + } + return $text; + } + + /** + * Return true if the frame is a template frame + * + * @return bool + */ + public function isTemplate() { + return true; + } + + public function setVolatile( $flag = true ) { + parent::setVolatile( $flag ); + $this->parent->setVolatile( $flag ); + } + + public function setTTL( $ttl ) { + parent::setTTL( $ttl ); + $this->parent->setTTL( $ttl ); + } +} + +/** + * Expansion frame with custom arguments + * @ingroup Parser + */ +// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps +class PPCustomFrame_DOM extends PPFrame_DOM { + + public $args; + + public function __construct( $preprocessor, $args ) { + parent::__construct( $preprocessor ); + $this->args = $args; + } + + public function __toString() { + $s = 'cstmframe{'; + $first = true; + foreach ( $this->args as $name => 
$value ) { + if ( $first ) { + $first = false; + } else { + $s .= ', '; + } + $s .= "\"$name\":\"" . + str_replace( '"', '\\"', $value->__toString() ) . '"'; + } + $s .= '}'; + return $s; + } + + /** + * @return bool + */ + public function isEmpty() { + return !count( $this->args ); + } + + /** + * @param int|string $index + * @return string|bool + */ + public function getArgument( $index ) { + if ( !isset( $this->args[$index] ) ) { + return false; + } + return $this->args[$index]; + } + + public function getArguments() { + return $this->args; + } +} + +/** + * @ingroup Parser + */ +// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps +class PPNode_DOM implements PPNode { + + /** + * @var DOMElement + */ + public $node; + public $xpath; + + public function __construct( $node, $xpath = false ) { + $this->node = $node; + } + + /** + * @return DOMXPath + */ + public function getXPath() { + if ( $this->xpath === null ) { + $this->xpath = new DOMXPath( $this->node->ownerDocument ); + } + return $this->xpath; + } + + public function __toString() { + if ( $this->node instanceof DOMNodeList ) { + $s = ''; + foreach ( $this->node as $node ) { + $s .= $node->ownerDocument->saveXML( $node ); + } + } else { + $s = $this->node->ownerDocument->saveXML( $this->node ); + } + return $s; + } + + /** + * @return bool|PPNode_DOM + */ + public function getChildren() { + return $this->node->childNodes ? new self( $this->node->childNodes ) : false; + } + + /** + * @return bool|PPNode_DOM + */ + public function getFirstChild() { + return $this->node->firstChild ? new self( $this->node->firstChild ) : false; + } + + /** + * @return bool|PPNode_DOM + */ + public function getNextSibling() { + return $this->node->nextSibling ? 
new self( $this->node->nextSibling ) : false; + } + + /** + * @param string $type + * + * @return bool|PPNode_DOM + */ + public function getChildrenOfType( $type ) { + return new self( $this->getXPath()->query( $type, $this->node ) ); + } + + /** + * @return int + */ + public function getLength() { + if ( $this->node instanceof DOMNodeList ) { + return $this->node->length; + } else { + return false; + } + } + + /** + * @param int $i + * @return bool|PPNode_DOM + */ + public function item( $i ) { + $item = $this->node->item( $i ); + return $item ? new self( $item ) : false; + } + + /** + * @return string + */ + public function getName() { + if ( $this->node instanceof DOMNodeList ) { + return '#nodelist'; + } else { + return $this->node->nodeName; + } + } + + /** + * Split a "<part>" node into an associative array containing: + * - name PPNode name + * - index String index + * - value PPNode value + * + * @throws MWException + * @return array + */ + public function splitArg() { + $xpath = $this->getXPath(); + $names = $xpath->query( 'name', $this->node ); + $values = $xpath->query( 'value', $this->node ); + if ( !$names->length || !$values->length ) { + throw new MWException( 'Invalid brace node passed to ' . __METHOD__ ); + } + $name = $names->item( 0 ); + $index = $name->getAttribute( 'index' ); + return [ + 'name' => new self( $name ), + 'index' => $index, + 'value' => new self( $values->item( 0 ) ) ]; + } + + /** + * Split an "<ext>" node into an associative array containing name, attr, inner and close + * All values in the resulting array are PPNodes. Inner and close are optional. 
+ * + * @throws MWException + * @return array + */ + public function splitExt() { + $xpath = $this->getXPath(); + $names = $xpath->query( 'name', $this->node ); + $attrs = $xpath->query( 'attr', $this->node ); + $inners = $xpath->query( 'inner', $this->node ); + $closes = $xpath->query( 'close', $this->node ); + if ( !$names->length || !$attrs->length ) { + throw new MWException( 'Invalid ext node passed to ' . __METHOD__ ); + } + $parts = [ + 'name' => new self( $names->item( 0 ) ), + 'attr' => new self( $attrs->item( 0 ) ) ]; + if ( $inners->length ) { + $parts['inner'] = new self( $inners->item( 0 ) ); + } + if ( $closes->length ) { + $parts['close'] = new self( $closes->item( 0 ) ); + } + return $parts; + } + + /** + * Split a "<h>" node + * @throws MWException + * @return array + */ + public function splitHeading() { + if ( $this->getName() !== 'h' ) { + throw new MWException( 'Invalid h node passed to ' . __METHOD__ ); + } + return [ + 'i' => $this->node->getAttribute( 'i' ), + 'level' => $this->node->getAttribute( 'level' ), + 'contents' => $this->getChildren() + ]; + } +} diff --git a/www/wiki/includes/parser/Preprocessor_Hash.php b/www/wiki/includes/parser/Preprocessor_Hash.php new file mode 100644 index 00000000..c7f630d5 --- /dev/null +++ b/www/wiki/includes/parser/Preprocessor_Hash.php @@ -0,0 +1,2258 @@ +<?php +/** + * Preprocessor using PHP arrays + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ + +/** + * Differences from DOM schema: + * * attribute nodes are children + * * "<h>" nodes that aren't at the top are replaced with <possible-h> + * + * Nodes are stored in a recursive array data structure. A node store is an + * array where each element may be either a scalar (representing a text node) + * or a "descriptor", which is a two-element array where the first element is + * the node name and the second element is the node store for the children. + * + * Attributes are represented as children that have a node name starting with + * "@", and a single text node child. + * + * @todo: Consider replacing descriptor arrays with objects of a new class. + * Benchmark and measure resulting memory impact. 
+ * + * @ingroup Parser + */ +// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps +class Preprocessor_Hash extends Preprocessor { + + /** + * @var Parser + */ + public $parser; + + const CACHE_PREFIX = 'preprocess-hash'; + const CACHE_VERSION = 2; + + public function __construct( $parser ) { + $this->parser = $parser; + } + + /** + * @return PPFrame_Hash + */ + public function newFrame() { + return new PPFrame_Hash( $this ); + } + + /** + * @param array $args + * @return PPCustomFrame_Hash + */ + public function newCustomFrame( $args ) { + return new PPCustomFrame_Hash( $this, $args ); + } + + /** + * @param array $values + * @return PPNode_Hash_Array + */ + public function newPartNodeArray( $values ) { + $list = []; + + foreach ( $values as $k => $val ) { + if ( is_int( $k ) ) { + $store = [ [ 'part', [ + [ 'name', [ [ '@index', [ $k ] ] ] ], + [ 'value', [ strval( $val ) ] ], + ] ] ]; + } else { + $store = [ [ 'part', [ + [ 'name', [ strval( $k ) ] ], + '=', + [ 'value', [ strval( $val ) ] ], + ] ] ]; + } + + $list[] = new PPNode_Hash_Tree( $store, 0 ); + } + + $node = new PPNode_Hash_Array( $list ); + return $node; + } + + /** + * Preprocess some wikitext and return the document tree. + * + * @param string $text The text to parse + * @param int $flags Bitwise combination of: + * Parser::PTD_FOR_INCLUSION Handle "<noinclude>" and "<includeonly>" as if the text is being + * included. Default is to assume a direct page view. + * + * The generated DOM tree must depend only on the input text and the flags. + * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of T6899. + * + * Any flag added to the $flags parameter here, or any other parameter liable to cause a + * change in the DOM tree for a given text, must be passed through the section identifier + * in the section edit link and thus back to extractSections(). 
+ * + * @throws MWException + * @return PPNode_Hash_Tree + */ + public function preprocessToObj( $text, $flags = 0 ) { + global $wgDisableLangConversion; + + $tree = $this->cacheGetTree( $text, $flags ); + if ( $tree !== false ) { + $store = json_decode( $tree ); + if ( is_array( $store ) ) { + return new PPNode_Hash_Tree( $store, 0 ); + } + } + + $forInclusion = $flags & Parser::PTD_FOR_INCLUSION; + + $xmlishElements = $this->parser->getStripList(); + $xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ]; + $enableOnlyinclude = false; + if ( $forInclusion ) { + $ignoredTags = [ 'includeonly', '/includeonly' ]; + $ignoredElements = [ 'noinclude' ]; + $xmlishElements[] = 'noinclude'; + if ( strpos( $text, '<onlyinclude>' ) !== false + && strpos( $text, '</onlyinclude>' ) !== false + ) { + $enableOnlyinclude = true; + } + } else { + $ignoredTags = [ 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' ]; + $ignoredElements = [ 'includeonly' ]; + $xmlishElements[] = 'includeonly'; + } + $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) ); + + // Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset + $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA"; + + $stack = new PPDStack_Hash; + + $searchBase = "[{<\n"; + if ( !$wgDisableLangConversion ) { + $searchBase .= '-'; + } + + // For fast reverse searches + $revText = strrev( $text ); + $lengthText = strlen( $text ); + + // Input pointer, starts out pointing to a pseudo-newline before the start + $i = 0; + // Current accumulator. See the doc comment for Preprocessor_Hash for the format. 
+ $accum =& $stack->getAccum(); + // True to find equals signs in arguments + $findEquals = false; + // True to take notice of pipe characters + $findPipe = false; + $headingIndex = 1; + // True if $i is inside a possible heading + $inHeading = false; + // True if there are no more greater-than (>) signs right of $i + $noMoreGT = false; + // Map of tag name => true if there are no more closing tags of given type right of $i + $noMoreClosingTag = []; + // True to ignore all input up to the next <onlyinclude> + $findOnlyinclude = $enableOnlyinclude; + // Do a line-start run without outputting an LF character + $fakeLineStart = true; + + while ( true ) { + // $this->memCheck(); + + if ( $findOnlyinclude ) { + // Ignore all input up to the next <onlyinclude> + $startPos = strpos( $text, '<onlyinclude>', $i ); + if ( $startPos === false ) { + // Ignored section runs to the end + $accum[] = [ 'ignore', [ substr( $text, $i ) ] ]; + break; + } + $tagEndPos = $startPos + strlen( '<onlyinclude>' ); // past-the-end + $accum[] = [ 'ignore', [ substr( $text, $i, $tagEndPos - $i ) ] ]; + $i = $tagEndPos; + $findOnlyinclude = false; + } + + if ( $fakeLineStart ) { + $found = 'line-start'; + $curChar = ''; + } else { + # Find next opening brace, closing brace or pipe + $search = $searchBase; + if ( $stack->top === false ) { + $currentClosing = ''; + } else { + $currentClosing = $stack->top->close; + $search .= $currentClosing; + } + if ( $findPipe ) { + $search .= '|'; + } + if ( $findEquals ) { + // First equals will be for the template + $search .= '='; + } + $rule = null; + # Output literal section, advance input counter + $literalLength = strcspn( $text, $search, $i ); + if ( $literalLength > 0 ) { + self::addLiteral( $accum, substr( $text, $i, $literalLength ) ); + $i += $literalLength; + } + if ( $i >= $lengthText ) { + if ( $currentClosing == "\n" ) { + // Do a past-the-end run to finish off the heading + $curChar = ''; + $found = 'line-end'; + } else { + # All done + 
break; + } + } else { + $curChar = $curTwoChar = $text[$i]; + if ( ( $i + 1 ) < $lengthText ) { + $curTwoChar .= $text[$i + 1]; + } + if ( $curChar == '|' ) { + $found = 'pipe'; + } elseif ( $curChar == '=' ) { + $found = 'equals'; + } elseif ( $curChar == '<' ) { + $found = 'angle'; + } elseif ( $curChar == "\n" ) { + if ( $inHeading ) { + $found = 'line-end'; + } else { + $found = 'line-start'; + } + } elseif ( $curTwoChar == $currentClosing ) { + $found = 'close'; + $curChar = $curTwoChar; + } elseif ( $curChar == $currentClosing ) { + $found = 'close'; + } elseif ( isset( $this->rules[$curTwoChar] ) ) { + $curChar = $curTwoChar; + $found = 'open'; + $rule = $this->rules[$curChar]; + } elseif ( isset( $this->rules[$curChar] ) ) { + $found = 'open'; + $rule = $this->rules[$curChar]; + } else { + # Some versions of PHP have a strcspn which stops on + # null characters; ignore these and continue. + # We also may get '-' and '}' characters here which + # don't match -{ or $currentClosing. Add these to + # output and continue. + if ( $curChar == '-' || $curChar == '}' ) { + self::addLiteral( $accum, $curChar ); + } + ++$i; + continue; + } + } + } + + if ( $found == 'angle' ) { + $matches = false; + // Handle </onlyinclude> + if ( $enableOnlyinclude + && substr( $text, $i, strlen( '</onlyinclude>' ) ) == '</onlyinclude>' + ) { + $findOnlyinclude = true; + continue; + } + + // Determine element name + if ( !preg_match( $elementsRegex, $text, $matches, 0, $i + 1 ) ) { + // Element name missing or not listed + self::addLiteral( $accum, '<' ); + ++$i; + continue; + } + // Handle comments + if ( isset( $matches[2] ) && $matches[2] == '!--' ) { + // To avoid leaving blank lines, when a sequence of + // space-separated comments is both preceded and followed by + // a newline (ignoring spaces), then + // trim leading and trailing spaces and the trailing newline. 
+ + // Find the end + $endPos = strpos( $text, '-->', $i + 4 ); + if ( $endPos === false ) { + // Unclosed comment in input, runs to end + $inner = substr( $text, $i ); + $accum[] = [ 'comment', [ $inner ] ]; + $i = $lengthText; + } else { + // Search backwards for leading whitespace + $wsStart = $i ? ( $i - strspn( $revText, " \t", $lengthText - $i ) ) : 0; + + // Search forwards for trailing whitespace + // $wsEnd will be the position of the last space (or the '>' if there's none) + $wsEnd = $endPos + 2 + strspn( $text, " \t", $endPos + 3 ); + + // Keep looking forward as long as we're finding more + // comments. + $comments = [ [ $wsStart, $wsEnd ] ]; + while ( substr( $text, $wsEnd + 1, 4 ) == '<!--' ) { + $c = strpos( $text, '-->', $wsEnd + 4 ); + if ( $c === false ) { + break; + } + $c = $c + 2 + strspn( $text, " \t", $c + 3 ); + $comments[] = [ $wsEnd + 1, $c ]; + $wsEnd = $c; + } + + // Eat the line if possible + // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at + // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but + // it's a possible beneficial b/c break. 
+ if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n" + && substr( $text, $wsEnd + 1, 1 ) == "\n" + ) { + // Remove leading whitespace from the end of the accumulator + $wsLength = $i - $wsStart; + $endIndex = count( $accum ) - 1; + + // Sanity check + if ( $wsLength > 0 + && $endIndex >= 0 + && is_string( $accum[$endIndex] ) + && strspn( $accum[$endIndex], " \t", -$wsLength ) === $wsLength + ) { + $accum[$endIndex] = substr( $accum[$endIndex], 0, -$wsLength ); + } + + // Dump all but the last comment to the accumulator + foreach ( $comments as $j => $com ) { + $startPos = $com[0]; + $endPos = $com[1] + 1; + if ( $j == ( count( $comments ) - 1 ) ) { + break; + } + $inner = substr( $text, $startPos, $endPos - $startPos ); + $accum[] = [ 'comment', [ $inner ] ]; + } + + // Do a line-start run next time to look for headings after the comment + $fakeLineStart = true; + } else { + // No line to eat, just take the comment itself + $startPos = $i; + $endPos += 2; + } + + if ( $stack->top ) { + $part = $stack->top->getCurrentPart(); + if ( !( isset( $part->commentEnd ) && $part->commentEnd == $wsStart - 1 ) ) { + $part->visualEnd = $wsStart; + } + // Else comments abutting, no change in visual end + $part->commentEnd = $endPos; + } + $i = $endPos + 1; + $inner = substr( $text, $startPos, $endPos - $startPos + 1 ); + $accum[] = [ 'comment', [ $inner ] ]; + } + continue; + } + $name = $matches[1]; + $lowerName = strtolower( $name ); + $attrStart = $i + strlen( $name ) + 1; + + // Find end of tag + $tagEndPos = $noMoreGT ? 
false : strpos( $text, '>', $attrStart ); + if ( $tagEndPos === false ) { + // Infinite backtrack + // Disable tag search to prevent worst-case O(N^2) performance + $noMoreGT = true; + self::addLiteral( $accum, '<' ); + ++$i; + continue; + } + + // Handle ignored tags + if ( in_array( $lowerName, $ignoredTags ) ) { + $accum[] = [ 'ignore', [ substr( $text, $i, $tagEndPos - $i + 1 ) ] ]; + $i = $tagEndPos + 1; + continue; + } + + $tagStartPos = $i; + if ( $text[$tagEndPos - 1] == '/' ) { + // Short end tag + $attrEnd = $tagEndPos - 1; + $inner = null; + $i = $tagEndPos + 1; + $close = null; + } else { + $attrEnd = $tagEndPos; + // Find closing tag + if ( + !isset( $noMoreClosingTag[$name] ) && + preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i", + $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 ) + ) { + $inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 ); + $i = $matches[0][1] + strlen( $matches[0][0] ); + $close = $matches[0][0]; + } else { + // No end tag + if ( in_array( $name, $xmlishAllowMissingEndTag ) ) { + // Let it run out to the end of the text. + $inner = substr( $text, $tagEndPos + 1 ); + $i = $lengthText; + $close = null; + } else { + // Don't match the tag, treat opening tag as literal and resume parsing. + $i = $tagEndPos + 1; + self::addLiteral( $accum, + substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) ); + // Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>... + $noMoreClosingTag[$name] = true; + continue; + } + } + } + // <includeonly> and <noinclude> just become <ignore> tags + if ( in_array( $lowerName, $ignoredElements ) ) { + $accum[] = [ 'ignore', [ substr( $text, $tagStartPos, $i - $tagStartPos ) ] ]; + continue; + } + + if ( $attrEnd <= $attrStart ) { + $attr = ''; + } else { + // Note that the attr element contains the whitespace between name and attribute, + // this is necessary for precise reconstruction during pre-save transform. 
+ $attr = substr( $text, $attrStart, $attrEnd - $attrStart ); + } + + $children = [ + [ 'name', [ $name ] ], + [ 'attr', [ $attr ] ] ]; + if ( $inner !== null ) { + $children[] = [ 'inner', [ $inner ] ]; + } + if ( $close !== null ) { + $children[] = [ 'close', [ $close ] ]; + } + $accum[] = [ 'ext', $children ]; + } elseif ( $found == 'line-start' ) { + // Is this the start of a heading? + // Line break belongs before the heading element in any case + if ( $fakeLineStart ) { + $fakeLineStart = false; + } else { + self::addLiteral( $accum, $curChar ); + $i++; + } + + $count = strspn( $text, '=', $i, 6 ); + if ( $count == 1 && $findEquals ) { + // DWIM: This looks kind of like a name/value separator. + // Let's let the equals handler have it and break the potential + // heading. This is heuristic, but AFAICT the methods for + // completely correct disambiguation are very complex. + } elseif ( $count > 0 ) { + $piece = [ + 'open' => "\n", + 'close' => "\n", + 'parts' => [ new PPDPart_Hash( str_repeat( '=', $count ) ) ], + 'startPos' => $i, + 'count' => $count ]; + $stack->push( $piece ); + $accum =& $stack->getAccum(); + $stackFlags = $stack->getFlags(); + if ( isset( $stackFlags['findEquals'] ) ) { + $findEquals = $stackFlags['findEquals']; + } + if ( isset( $stackFlags['findPipe'] ) ) { + $findPipe = $stackFlags['findPipe']; + } + if ( isset( $stackFlags['inHeading'] ) ) { + $inHeading = $stackFlags['inHeading']; + } + $i += $count; + } + } elseif ( $found == 'line-end' ) { + $piece = $stack->top; + // A heading must be open, otherwise \n wouldn't have been in the search list + assert( $piece->open === "\n" ); + $part = $piece->getCurrentPart(); + // Search back through the input to see if it has a proper close. + // Do this using the reversed string since the other solutions + // (end anchor, etc.) are inefficient. 
+ $wsLength = strspn( $revText, " \t", $lengthText - $i ); + $searchStart = $i - $wsLength; + if ( isset( $part->commentEnd ) && $searchStart - 1 == $part->commentEnd ) { + // Comment found at line end + // Search for equals signs before the comment + $searchStart = $part->visualEnd; + $searchStart -= strspn( $revText, " \t", $lengthText - $searchStart ); + } + $count = $piece->count; + $equalsLength = strspn( $revText, '=', $lengthText - $searchStart ); + if ( $equalsLength > 0 ) { + if ( $searchStart - $equalsLength == $piece->startPos ) { + // This is just a single string of equals signs on its own line + // Replicate the doHeadings behavior /={count}(.+)={count}/ + // First find out how many equals signs there really are (don't stop at 6) + $count = $equalsLength; + if ( $count < 3 ) { + $count = 0; + } else { + $count = min( 6, intval( ( $count - 1 ) / 2 ) ); + } + } else { + $count = min( $equalsLength, $count ); + } + if ( $count > 0 ) { + // Normal match, output <h> + $element = [ [ 'possible-h', + array_merge( + [ + [ '@level', [ $count ] ], + [ '@i', [ $headingIndex++ ] ] + ], + $accum + ) + ] ]; + } else { + // Single equals sign on its own line, count=0 + $element = $accum; + } + } else { + // No match, no <h>, just pass down the inner text + $element = $accum; + } + // Unwind the stack + $stack->pop(); + $accum =& $stack->getAccum(); + $stackFlags = $stack->getFlags(); + if ( isset( $stackFlags['findEquals'] ) ) { + $findEquals = $stackFlags['findEquals']; + } + if ( isset( $stackFlags['findPipe'] ) ) { + $findPipe = $stackFlags['findPipe']; + } + if ( isset( $stackFlags['inHeading'] ) ) { + $inHeading = $stackFlags['inHeading']; + } + + // Append the result to the enclosing accumulator + array_splice( $accum, count( $accum ), 0, $element ); + + // Note that we do NOT increment the input pointer. + // This is because the closing linebreak could be the opening linebreak of + // another heading. 
Infinite loops are avoided because the next iteration MUST + // hit the heading open case above, which unconditionally increments the + // input pointer. + } elseif ( $found == 'open' ) { + # count opening brace characters + $curLen = strlen( $curChar ); + $count = ( $curLen > 1 ) ? + # allow the final character to repeat + strspn( $text, $curChar[$curLen - 1], $i + 1 ) + 1 : + strspn( $text, $curChar, $i ); + + $savedPrefix = ''; + $lineStart = ( $i > 0 && $text[$i - 1] == "\n" ); + + if ( $curChar === "-{" && $count > $curLen ) { + // -{ => {{ transition because rightmost wins + $savedPrefix = '-'; + $i++; + $curChar = '{'; + $count--; + $rule = $this->rules[$curChar]; + } + + # we need to add to stack only if opening brace count is enough for one of the rules + if ( $count >= $rule['min'] ) { + # Add it to the stack + $piece = [ + 'open' => $curChar, + 'close' => $rule['end'], + 'savedPrefix' => $savedPrefix, + 'count' => $count, + 'lineStart' => $lineStart, + ]; + + $stack->push( $piece ); + $accum =& $stack->getAccum(); + $stackFlags = $stack->getFlags(); + if ( isset( $stackFlags['findEquals'] ) ) { + $findEquals = $stackFlags['findEquals']; + } + if ( isset( $stackFlags['findPipe'] ) ) { + $findPipe = $stackFlags['findPipe']; + } + if ( isset( $stackFlags['inHeading'] ) ) { + $inHeading = $stackFlags['inHeading']; + } + } else { + # Add literal brace(s) + self::addLiteral( $accum, $savedPrefix . str_repeat( $curChar, $count ) ); + } + $i += $count; + } elseif ( $found == 'close' ) { + $piece = $stack->top; + # lets check if there are enough characters for closing brace + $maxCount = $piece->count; + if ( $piece->close === '}-' && $curChar === '}' ) { + $maxCount--; # don't try to match closing '-' as a '}' + } + $curLen = strlen( $curChar ); + $count = ( $curLen > 1 ) ? 
$curLen : + strspn( $text, $curChar, $i, $maxCount ); + + # check for maximum matching characters (if there are 5 closing + # characters, we will probably need only 3 - depending on the rules) + $rule = $this->rules[$piece->open]; + if ( $count > $rule['max'] ) { + # The specified maximum exists in the callback array, unless the caller + # has made an error + $matchingCount = $rule['max']; + } else { + # Count is less than the maximum + # Skip any gaps in the callback array to find the true largest match + # Need to use array_key_exists not isset because the callback can be null + $matchingCount = $count; + while ( $matchingCount > 0 && !array_key_exists( $matchingCount, $rule['names'] ) ) { + --$matchingCount; + } + } + + if ( $matchingCount <= 0 ) { + # No matching element found in callback array + # Output a literal closing brace and continue + $endText = substr( $text, $i, $count ); + self::addLiteral( $accum, $endText ); + $i += $count; + continue; + } + $name = $rule['names'][$matchingCount]; + if ( $name === null ) { + // No element, just literal text + $endText = substr( $text, $i, $matchingCount ); + $element = $piece->breakSyntax( $matchingCount ); + self::addLiteral( $element, $endText ); + } else { + # Create XML element + $parts = $piece->parts; + $titleAccum = $parts[0]->out; + unset( $parts[0] ); + + $children = []; + + # The invocation is at the start of the line if lineStart is set in + # the stack, and all opening brackets are used up. 
+ if ( $maxCount == $matchingCount && + !empty( $piece->lineStart ) && + strlen( $piece->savedPrefix ) == 0 ) { + $children[] = [ '@lineStart', [ 1 ] ]; + } + $titleNode = [ 'title', $titleAccum ]; + $children[] = $titleNode; + $argIndex = 1; + foreach ( $parts as $part ) { + if ( isset( $part->eqpos ) ) { + $equalsNode = $part->out[$part->eqpos]; + $nameNode = [ 'name', array_slice( $part->out, 0, $part->eqpos ) ]; + $valueNode = [ 'value', array_slice( $part->out, $part->eqpos + 1 ) ]; + $partNode = [ 'part', [ $nameNode, $equalsNode, $valueNode ] ]; + $children[] = $partNode; + } else { + $nameNode = [ 'name', [ [ '@index', [ $argIndex++ ] ] ] ]; + $valueNode = [ 'value', $part->out ]; + $partNode = [ 'part', [ $nameNode, $valueNode ] ]; + $children[] = $partNode; + } + } + $element = [ [ $name, $children ] ]; + } + + # Advance input pointer + $i += $matchingCount; + + # Unwind the stack + $stack->pop(); + $accum =& $stack->getAccum(); + + # Re-add the old stack element if it still has unmatched opening characters remaining + if ( $matchingCount < $piece->count ) { + $piece->parts = [ new PPDPart_Hash ]; + $piece->count -= $matchingCount; + # do we still qualify for any callback with remaining count? + $min = $this->rules[$piece->open]['min']; + if ( $piece->count >= $min ) { + $stack->push( $piece ); + $accum =& $stack->getAccum(); + } elseif ( $piece->count == 1 && $piece->open === '{' && $piece->savedPrefix === '-' ) { + $piece->savedPrefix = ''; + $piece->open = '-{'; + $piece->count = 2; + $piece->close = $this->rules[$piece->open]['end']; + $stack->push( $piece ); + $accum =& $stack->getAccum(); + } else { + $s = substr( $piece->open, 0, -1 ); + $s .= str_repeat( + substr( $piece->open, -1 ), + $piece->count - strlen( $s ) + ); + self::addLiteral( $accum, $piece->savedPrefix . 
$s ); + } + } elseif ( $piece->savedPrefix !== '' ) { + self::addLiteral( $accum, $piece->savedPrefix ); + } + + $stackFlags = $stack->getFlags(); + if ( isset( $stackFlags['findEquals'] ) ) { + $findEquals = $stackFlags['findEquals']; + } + if ( isset( $stackFlags['findPipe'] ) ) { + $findPipe = $stackFlags['findPipe']; + } + if ( isset( $stackFlags['inHeading'] ) ) { + $inHeading = $stackFlags['inHeading']; + } + + # Add XML element to the enclosing accumulator + array_splice( $accum, count( $accum ), 0, $element ); + } elseif ( $found == 'pipe' ) { + $findEquals = true; // shortcut for getFlags() + $stack->addPart(); + $accum =& $stack->getAccum(); + ++$i; + } elseif ( $found == 'equals' ) { + $findEquals = false; // shortcut for getFlags() + $accum[] = [ 'equals', [ '=' ] ]; + $stack->getCurrentPart()->eqpos = count( $accum ) - 1; + ++$i; + } + } + + # Output any remaining unclosed brackets + foreach ( $stack->stack as $piece ) { + array_splice( $stack->rootAccum, count( $stack->rootAccum ), 0, $piece->breakSyntax() ); + } + + # Enable top-level headings + foreach ( $stack->rootAccum as &$node ) { + if ( is_array( $node ) && $node[PPNode_Hash_Tree::NAME] === 'possible-h' ) { + $node[PPNode_Hash_Tree::NAME] = 'h'; + } + } + + $rootStore = [ [ 'root', $stack->rootAccum ] ]; + $rootNode = new PPNode_Hash_Tree( $rootStore, 0 ); + + // Cache + $tree = json_encode( $rootStore, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE ); + if ( $tree !== false ) { + $this->cacheSetTree( $text, $flags, $tree ); + } + + return $rootNode; + } + + private static function addLiteral( array &$accum, $text ) { + $n = count( $accum ); + if ( $n && is_string( $accum[$n - 1] ) ) { + $accum[$n - 1] .= $text; + } else { + $accum[] = $text; + } + } +} + +/** + * Stack class to help Preprocessor::preprocessToObj() + * @ingroup Parser + */ +// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps +class PPDStack_Hash extends PPDStack { + + public function __construct() { + 
$this->elementClass = PPDStackElement_Hash::class; + parent::__construct(); + $this->rootAccum = []; + } +} + +/** + * @ingroup Parser + */ +// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps +class PPDStackElement_Hash extends PPDStackElement { + + public function __construct( $data = [] ) { + $this->partClass = PPDPart_Hash::class; + parent::__construct( $data ); + } + + /** + * Get the accumulator that would result if the close is not found. + * + * @param int|bool $openingCount + * @return array + */ + public function breakSyntax( $openingCount = false ) { + if ( $this->open == "\n" ) { + $accum = array_merge( [ $this->savedPrefix ], $this->parts[0]->out ); + } else { + if ( $openingCount === false ) { + $openingCount = $this->count; + } + $s = substr( $this->open, 0, -1 ); + $s .= str_repeat( + substr( $this->open, -1 ), + $openingCount - strlen( $s ) + ); + $accum = [ $this->savedPrefix . $s ]; + $lastIndex = 0; + $first = true; + foreach ( $this->parts as $part ) { + if ( $first ) { + $first = false; + } elseif ( is_string( $accum[$lastIndex] ) ) { + $accum[$lastIndex] .= '|'; + } else { + $accum[++$lastIndex] = '|'; + } + foreach ( $part->out as $node ) { + if ( is_string( $node ) && is_string( $accum[$lastIndex] ) ) { + $accum[$lastIndex] .= $node; + } else { + $accum[++$lastIndex] = $node; + } + } + } + } + return $accum; + } +} + +/** + * @ingroup Parser + */ +// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps +class PPDPart_Hash extends PPDPart { + + public function __construct( $out = '' ) { + if ( $out !== '' ) { + $accum = [ $out ]; + } else { + $accum = []; + } + parent::__construct( $accum ); + } +} + +/** + * An expansion frame, used as a context to expand the result of preprocessToObj() + * @ingroup Parser + */ +// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps +class PPFrame_Hash implements PPFrame { + + /** + * @var Parser + */ + public $parser; + + /** + * @var Preprocessor + */ + public $preprocessor; + + /** + * @var 
Title + */ + public $title; + public $titleCache; + + /** + * Hashtable listing templates which are disallowed for expansion in this frame, + * having been encountered previously in parent frames. + */ + public $loopCheckHash; + + /** + * Recursion depth of this frame, top = 0 + * Note that this is NOT the same as expansion depth in expand() + */ + public $depth; + + private $volatile = false; + private $ttl = null; + + /** + * @var array + */ + protected $childExpansionCache; + + /** + * Construct a new preprocessor frame. + * @param Preprocessor $preprocessor The parent preprocessor + */ + public function __construct( $preprocessor ) { + $this->preprocessor = $preprocessor; + $this->parser = $preprocessor->parser; + $this->title = $this->parser->mTitle; + $this->titleCache = [ $this->title ? $this->title->getPrefixedDBkey() : false ]; + $this->loopCheckHash = []; + $this->depth = 0; + $this->childExpansionCache = []; + } + + /** + * Create a new child frame + * $args is optionally a multi-root PPNode or array containing the template arguments + * + * @param array|bool|PPNode_Hash_Array $args + * @param Title|bool $title + * @param int $indexOffset + * @throws MWException + * @return PPTemplateFrame_Hash + */ + public function newChild( $args = false, $title = false, $indexOffset = 0 ) { + $namedArgs = []; + $numberedArgs = []; + if ( $title === false ) { + $title = $this->title; + } + if ( $args !== false ) { + if ( $args instanceof PPNode_Hash_Array ) { + $args = $args->value; + } elseif ( !is_array( $args ) ) { + throw new MWException( __METHOD__ . 
': $args must be array or PPNode_Hash_Array' ); + } + foreach ( $args as $arg ) { + $bits = $arg->splitArg(); + if ( $bits['index'] !== '' ) { + // Numbered parameter + $index = $bits['index'] - $indexOffset; + if ( isset( $namedArgs[$index] ) || isset( $numberedArgs[$index] ) ) { + $this->parser->getOutput()->addWarning( wfMessage( 'duplicate-args-warning', + wfEscapeWikiText( $this->title ), + wfEscapeWikiText( $title ), + wfEscapeWikiText( $index ) )->text() ); + $this->parser->addTrackingCategory( 'duplicate-args-category' ); + } + $numberedArgs[$index] = $bits['value']; + unset( $namedArgs[$index] ); + } else { + // Named parameter + $name = trim( $this->expand( $bits['name'], PPFrame::STRIP_COMMENTS ) ); + if ( isset( $namedArgs[$name] ) || isset( $numberedArgs[$name] ) ) { + $this->parser->getOutput()->addWarning( wfMessage( 'duplicate-args-warning', + wfEscapeWikiText( $this->title ), + wfEscapeWikiText( $title ), + wfEscapeWikiText( $name ) )->text() ); + $this->parser->addTrackingCategory( 'duplicate-args-category' ); + } + $namedArgs[$name] = $bits['value']; + unset( $numberedArgs[$name] ); + } + } + } + return new PPTemplateFrame_Hash( $this->preprocessor, $this, $numberedArgs, $namedArgs, $title ); + } + + /** + * @throws MWException + * @param string|int $key + * @param string|PPNode $root + * @param int $flags + * @return string + */ + public function cachedExpand( $key, $root, $flags = 0 ) { + // we don't have a parent, so we don't have a cache + return $this->expand( $root, $flags ); + } + + /** + * @throws MWException + * @param string|PPNode $root + * @param int $flags + * @return string + */ + public function expand( $root, $flags = 0 ) { + static $expansionDepth = 0; + if ( is_string( $root ) ) { + return $root; + } + + if ( ++$this->parser->mPPNodeCount > $this->parser->mOptions->getMaxPPNodeCount() ) { + $this->parser->limitationWarn( 'node-count-exceeded', + $this->parser->mPPNodeCount, + $this->parser->mOptions->getMaxPPNodeCount() + ); + 
return '<span class="error">Node-count limit exceeded</span>'; + } + if ( $expansionDepth > $this->parser->mOptions->getMaxPPExpandDepth() ) { + $this->parser->limitationWarn( 'expansion-depth-exceeded', + $expansionDepth, + $this->parser->mOptions->getMaxPPExpandDepth() + ); + return '<span class="error">Expansion depth limit exceeded</span>'; + } + ++$expansionDepth; + if ( $expansionDepth > $this->parser->mHighestExpansionDepth ) { + $this->parser->mHighestExpansionDepth = $expansionDepth; + } + + $outStack = [ '', '' ]; + $iteratorStack = [ false, $root ]; + $indexStack = [ 0, 0 ]; + + while ( count( $iteratorStack ) > 1 ) { + $level = count( $outStack ) - 1; + $iteratorNode =& $iteratorStack[$level]; + $out =& $outStack[$level]; + $index =& $indexStack[$level]; + + if ( is_array( $iteratorNode ) ) { + if ( $index >= count( $iteratorNode ) ) { + // All done with this iterator + $iteratorStack[$level] = false; + $contextNode = false; + } else { + $contextNode = $iteratorNode[$index]; + $index++; + } + } elseif ( $iteratorNode instanceof PPNode_Hash_Array ) { + if ( $index >= $iteratorNode->getLength() ) { + // All done with this iterator + $iteratorStack[$level] = false; + $contextNode = false; + } else { + $contextNode = $iteratorNode->item( $index ); + $index++; + } + } else { + // Copy to $contextNode and then delete from iterator stack, + // because this is not an iterator but we do have to execute it once + $contextNode = $iteratorStack[$level]; + $iteratorStack[$level] = false; + } + + $newIterator = false; + $contextName = false; + $contextChildren = false; + + if ( $contextNode === false ) { + // nothing to do + } elseif ( is_string( $contextNode ) ) { + $out .= $contextNode; + } elseif ( $contextNode instanceof PPNode_Hash_Array ) { + $newIterator = $contextNode; + } elseif ( $contextNode instanceof PPNode_Hash_Attr ) { + // No output + } elseif ( $contextNode instanceof PPNode_Hash_Text ) { + $out .= $contextNode->value; + } elseif ( $contextNode 
instanceof PPNode_Hash_Tree ) { + $contextName = $contextNode->name; + $contextChildren = $contextNode->getRawChildren(); + } elseif ( is_array( $contextNode ) ) { + // Node descriptor array + if ( count( $contextNode ) !== 2 ) { + throw new MWException( __METHOD__. + ': found an array where a node descriptor should be' ); + } + list( $contextName, $contextChildren ) = $contextNode; + } else { + throw new MWException( __METHOD__ . ': Invalid parameter type' ); + } + + // Handle node descriptor array or tree object + if ( $contextName === false ) { + // Not a node, already handled above + } elseif ( $contextName[0] === '@' ) { + // Attribute: no output + } elseif ( $contextName === 'template' ) { + # Double-brace expansion + $bits = PPNode_Hash_Tree::splitRawTemplate( $contextChildren ); + if ( $flags & PPFrame::NO_TEMPLATES ) { + $newIterator = $this->virtualBracketedImplode( + '{{', '|', '}}', + $bits['title'], + $bits['parts'] + ); + } else { + $ret = $this->parser->braceSubstitution( $bits, $this ); + if ( isset( $ret['object'] ) ) { + $newIterator = $ret['object']; + } else { + $out .= $ret['text']; + } + } + } elseif ( $contextName === 'tplarg' ) { + # Triple-brace expansion + $bits = PPNode_Hash_Tree::splitRawTemplate( $contextChildren ); + if ( $flags & PPFrame::NO_ARGS ) { + $newIterator = $this->virtualBracketedImplode( + '{{{', '|', '}}}', + $bits['title'], + $bits['parts'] + ); + } else { + $ret = $this->parser->argSubstitution( $bits, $this ); + if ( isset( $ret['object'] ) ) { + $newIterator = $ret['object']; + } else { + $out .= $ret['text']; + } + } + } elseif ( $contextName === 'comment' ) { + # HTML-style comment + # Remove it in HTML, pre+remove and STRIP_COMMENTS modes + # Not in RECOVER_COMMENTS mode (msgnw) though. 
+ if ( ( $this->parser->ot['html'] + || ( $this->parser->ot['pre'] && $this->parser->mOptions->getRemoveComments() ) + || ( $flags & PPFrame::STRIP_COMMENTS ) + ) && !( $flags & PPFrame::RECOVER_COMMENTS ) + ) { + $out .= ''; + } elseif ( $this->parser->ot['wiki'] && !( $flags & PPFrame::RECOVER_COMMENTS ) ) { + # Add a strip marker in PST mode so that pstPass2() can + # run some old-fashioned regexes on the result. + # Not in RECOVER_COMMENTS mode (extractSections) though. + $out .= $this->parser->insertStripItem( $contextChildren[0] ); + } else { + # Recover the literal comment in RECOVER_COMMENTS and pre+no-remove + $out .= $contextChildren[0]; + } + } elseif ( $contextName === 'ignore' ) { + # Output suppression used by <includeonly> etc. + # OT_WIKI will only respect <ignore> in substed templates. + # The other output types respect it unless NO_IGNORE is set. + # extractSections() sets NO_IGNORE and so never respects it. + if ( ( !isset( $this->parent ) && $this->parser->ot['wiki'] ) + || ( $flags & PPFrame::NO_IGNORE ) + ) { + $out .= $contextChildren[0]; + } else { + // $out .= ''; + } + } elseif ( $contextName === 'ext' ) { + # Extension tag + $bits = PPNode_Hash_Tree::splitRawExt( $contextChildren ) + + [ 'attr' => null, 'inner' => null, 'close' => null ]; + if ( $flags & PPFrame::NO_TAGS ) { + $s = '<' . $bits['name']->getFirstChild()->value; + if ( $bits['attr'] ) { + $s .= $bits['attr']->getFirstChild()->value; + } + if ( $bits['inner'] ) { + $s .= '>' . 
$bits['inner']->getFirstChild()->value; + if ( $bits['close'] ) { + $s .= $bits['close']->getFirstChild()->value; + } + } else { + $s .= '/>'; + } + $out .= $s; + } else { + $out .= $this->parser->extensionSubstitution( $bits, $this ); + } + } elseif ( $contextName === 'h' ) { + # Heading + if ( $this->parser->ot['html'] ) { + # Expand immediately and insert heading index marker + $s = $this->expand( $contextChildren, $flags ); + $bits = PPNode_Hash_Tree::splitRawHeading( $contextChildren ); + $titleText = $this->title->getPrefixedDBkey(); + $this->parser->mHeadings[] = [ $titleText, $bits['i'] ]; + $serial = count( $this->parser->mHeadings ) - 1; + $marker = Parser::MARKER_PREFIX . "-h-$serial-" . Parser::MARKER_SUFFIX; + $s = substr( $s, 0, $bits['level'] ) . $marker . substr( $s, $bits['level'] ); + $this->parser->mStripState->addGeneral( $marker, '' ); + $out .= $s; + } else { + # Expand in virtual stack + $newIterator = $contextChildren; + } + } else { + # Generic recursive expansion + $newIterator = $contextChildren; + } + + if ( $newIterator !== false ) { + $outStack[] = ''; + $iteratorStack[] = $newIterator; + $indexStack[] = 0; + } elseif ( $iteratorStack[$level] === false ) { + // Return accumulated value to parent + // With tail recursion + while ( $iteratorStack[$level] === false && $level > 0 ) { + $outStack[$level - 1] .= $out; + array_pop( $outStack ); + array_pop( $iteratorStack ); + array_pop( $indexStack ); + $level--; + } + } + } + --$expansionDepth; + return $outStack[0]; + } + + /** + * @param string $sep + * @param int $flags + * @param string|PPNode $args,... + * @return string + */ + public function implodeWithFlags( $sep, $flags /*, ... 
*/ ) { + $args = array_slice( func_get_args(), 2 ); + + $first = true; + $s = ''; + foreach ( $args as $root ) { + if ( $root instanceof PPNode_Hash_Array ) { + $root = $root->value; + } + if ( !is_array( $root ) ) { + $root = [ $root ]; + } + foreach ( $root as $node ) { + if ( $first ) { + $first = false; + } else { + $s .= $sep; + } + $s .= $this->expand( $node, $flags ); + } + } + return $s; + } + + /** + * Implode with no flags specified + * This previously called implodeWithFlags but has now been inlined to reduce stack depth + * @param string $sep + * @param string|PPNode $args,... + * @return string + */ + public function implode( $sep /*, ... */ ) { + $args = array_slice( func_get_args(), 1 ); + + $first = true; + $s = ''; + foreach ( $args as $root ) { + if ( $root instanceof PPNode_Hash_Array ) { + $root = $root->value; + } + if ( !is_array( $root ) ) { + $root = [ $root ]; + } + foreach ( $root as $node ) { + if ( $first ) { + $first = false; + } else { + $s .= $sep; + } + $s .= $this->expand( $node ); + } + } + return $s; + } + + /** + * Makes an object that, when expand()ed, will be the same as one obtained + * with implode() + * + * @param string $sep + * @param string|PPNode $args,... + * @return PPNode_Hash_Array + */ + public function virtualImplode( $sep /*, ... */ ) { + $args = array_slice( func_get_args(), 1 ); + $out = []; + $first = true; + + foreach ( $args as $root ) { + if ( $root instanceof PPNode_Hash_Array ) { + $root = $root->value; + } + if ( !is_array( $root ) ) { + $root = [ $root ]; + } + foreach ( $root as $node ) { + if ( $first ) { + $first = false; + } else { + $out[] = $sep; + } + $out[] = $node; + } + } + return new PPNode_Hash_Array( $out ); + } + + /** + * Virtual implode with brackets + * + * @param string $start + * @param string $sep + * @param string $end + * @param string|PPNode $args,... + * @return PPNode_Hash_Array + */ + public function virtualBracketedImplode( $start, $sep, $end /*, ... 
*/ ) { + $args = array_slice( func_get_args(), 3 ); + $out = [ $start ]; + $first = true; + + foreach ( $args as $root ) { + if ( $root instanceof PPNode_Hash_Array ) { + $root = $root->value; + } + if ( !is_array( $root ) ) { + $root = [ $root ]; + } + foreach ( $root as $node ) { + if ( $first ) { + $first = false; + } else { + $out[] = $sep; + } + $out[] = $node; + } + } + $out[] = $end; + return new PPNode_Hash_Array( $out ); + } + + public function __toString() { + return 'frame{}'; + } + + /** + * @param bool $level + * @return array|bool|string + */ + public function getPDBK( $level = false ) { + if ( $level === false ) { + return $this->title->getPrefixedDBkey(); + } else { + return isset( $this->titleCache[$level] ) ? $this->titleCache[$level] : false; + } + } + + /** + * @return array + */ + public function getArguments() { + return []; + } + + /** + * @return array + */ + public function getNumberedArguments() { + return []; + } + + /** + * @return array + */ + public function getNamedArguments() { + return []; + } + + /** + * Returns true if there are no arguments in this frame + * + * @return bool + */ + public function isEmpty() { + return true; + } + + /** + * @param int|string $name + * @return bool Always false in this implementation. 
*/
	public function getArgument( $name ) {
		return false;
	}

	/**
	 * Returns true if the infinite loop check is OK, false if a loop is detected
	 *
	 * @param Title $title
	 *
	 * @return bool
	 */
	public function loopCheck( $title ) {
		return !isset( $this->loopCheckHash[$title->getPrefixedDBkey()] );
	}

	/**
	 * Return true if the frame is a template frame
	 *
	 * @return bool
	 */
	public function isTemplate() {
		return false;
	}

	/**
	 * Get a title of frame
	 *
	 * @return Title
	 */
	public function getTitle() {
		return $this->title;
	}

	/**
	 * Set the volatile flag
	 *
	 * @param bool $flag
	 */
	public function setVolatile( $flag = true ) {
		$this->volatile = $flag;
	}

	/**
	 * Get the volatile flag
	 *
	 * @return bool
	 */
	public function isVolatile() {
		return $this->volatile;
	}

	/**
	 * Set the TTL
	 *
	 * @param int $ttl
	 */
	public function setTTL( $ttl ) {
		if ( $ttl !== null && ( $this->ttl === null || $ttl < $this->ttl ) ) {
			$this->ttl = $ttl;
		}
	}

	/**
	 * Get the TTL
	 *
	 * @return int|null
	 */
	public function getTTL() {
		return $this->ttl;
	}
}

/**
 * Expansion frame that carries the numbered and named arguments of a template call.
 *
 * Argument values are expanded lazily in the parent frame and memoised per key.
 *
 * @ingroup Parser
 */
// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps
class PPTemplateFrame_Hash extends PPFrame_Hash {

	public $numberedArgs, $namedArgs, $parent;
	public $numberedExpansionCache, $namedExpansionCache;

	/**
	 * @param Preprocessor $preprocessor
	 * @param bool|PPFrame $parent Frame the template was called from
	 * @param array $numberedArgs Unexpanded positional arguments
	 * @param array $namedArgs Unexpanded named arguments
	 * @param bool|Title $title Title of the template, or false
	 */
	public function __construct( $preprocessor, $parent = false, $numberedArgs = [],
		$namedArgs = [], $title = false
	) {
		parent::__construct( $preprocessor );

		$this->parent = $parent;
		$this->numberedArgs = $numberedArgs;
		$this->namedArgs = $namedArgs;
		$this->title = $title;

		// Copy the parent's ancestry bookkeeping, then append this frame's own entry.
		$prefixedKey = $title ? $title->getPrefixedDBkey() : false;
		$this->titleCache = $parent->titleCache;
		$this->titleCache[] = $prefixedKey;
		$this->loopCheckHash = /*clone*/ $parent->loopCheckHash;
		if ( $prefixedKey !== false ) {
			$this->loopCheckHash[$prefixedKey] = true;
		}
		$this->depth = $parent->depth + 1;

		$this->namedExpansionCache = [];
		$this->numberedExpansionCache = $this->namedExpansionCache;
	}

	/**
	 * Debug representation: every argument rendered as "name":"value".
	 *
	 * @return string
	 */
	public function __toString() {
		$pairs = [];
		foreach ( $this->numberedArgs + $this->namedArgs as $name => $value ) {
			$pairs[] = "\"$name\":\"" .
				str_replace( '"', '\\"', $value->__toString() ) . '"';
		}
		return 'tplframe{' . implode( ', ', $pairs ) . '}';
	}

	/**
	 * Expand $root, reusing the result stored under $key in the parent's
	 * child expansion cache when there is one. Results of volatile frames
	 * are returned but not stored.
	 *
	 * @throws MWException
	 * @param string|int $key
	 * @param string|PPNode $root
	 * @param int $flags
	 * @return string
	 */
	public function cachedExpand( $key, $root, $flags = 0 ) {
		if ( !isset( $this->parent->childExpansionCache[$key] ) ) {
			$expanded = $this->expand( $root, $flags );
			if ( !$this->isVolatile() ) {
				$this->parent->childExpansionCache[$key] = $expanded;
			}
			return $expanded;
		}
		return $this->parent->childExpansionCache[$key];
	}

	/**
	 * Returns true if there are no arguments in this frame
	 *
	 * @return bool
	 */
	public function isEmpty() {
		return count( $this->numberedArgs ) === 0 && count( $this->namedArgs ) === 0;
	}

	/**
	 * All arguments, numbered ones first, each resolved through getArgument().
	 *
	 * @return array
	 */
	public function getArguments() {
		$resolved = [];
		foreach ( [ $this->numberedArgs, $this->namedArgs ] as $argSet ) {
			foreach ( array_keys( $argSet ) as $key ) {
				$resolved[$key] = $this->getArgument( $key );
			}
		}
		return $resolved;
	}

	/**
	 * Numbered arguments only, each resolved through getArgument().
	 *
	 * @return array
	 */
	public function getNumberedArguments() {
		$resolved = [];
		$keys = array_keys( $this->numberedArgs );
		foreach ( $keys as $key ) {
			$resolved[$key] = $this->getArgument( $key );
		}
		return $resolved;
	}

	/**
	 * Named arguments only, each resolved through getArgument().
	 *
	 * @return array
	 */
	public function getNamedArguments() {
		$resolved = [];
		$keys = array_keys( $this->namedArgs );
		foreach ( $keys as $key ) {
			$resolved[$key] = $this->getArgument( $key );
		}
		return $resolved;
	}

	/**
	 * Expanded value of a numbered argument, or false if it was not passed.
	 *
	 * @param int $index
	 * @return string|bool
	 */
	public function getNumberedArgument( $index ) {
		if ( isset( $this->numberedArgs[$index] ) ) {
			if ( !isset( $this->numberedExpansionCache[$index] ) ) {
				# No trimming for unnamed arguments
				$this->numberedExpansionCache[$index] = $this->parent->expand(
					$this->numberedArgs[$index],
					PPFrame::STRIP_COMMENTS
				);
			}
			return $this->numberedExpansionCache[$index];
		}
		return false;
	}

	/**
	 * Expanded, trimmed value of a named argument, or false if it was not passed.
	 *
	 * @param string $name
	 * @return string|bool
	 */
	public function getNamedArgument( $name ) {
		if ( isset( $this->namedArgs[$name] ) ) {
			if ( !isset( $this->namedExpansionCache[$name] ) ) {
				# Trim named arguments post-expand, for backwards compatibility
				$expanded = $this->parent->expand( $this->namedArgs[$name], PPFrame::STRIP_COMMENTS );
				$this->namedExpansionCache[$name] = trim( $expanded );
			}
			return $this->namedExpansionCache[$name];
		}
		return false;
	}

	/**
	 * Look an argument up as a numbered argument first, then as a named one.
	 *
	 * @param int|string $name
	 * @return string|bool
	 */
	public function getArgument( $name ) {
		$text = $this->getNumberedArgument( $name );
		return $text === false ? $this->getNamedArgument( $name ) : $text;
	}

	/**
	 * Return true if the frame is a template frame
	 *
	 * @return bool
	 */
	public function isTemplate() {
		return true;
	}

	/**
	 * Set the volatile flag on this frame and on its parent.
	 *
	 * @param bool $flag
	 */
	public function setVolatile( $flag = true ) {
		parent::setVolatile( $flag );
		$this->parent->setVolatile( $flag );
	}

	/**
	 * Apply the TTL to this frame and to its parent.
	 *
	 * @param int $ttl
	 */
	public function setTTL( $ttl ) {
		parent::setTTL( $ttl );
		$this->parent->setTTL( $ttl );
	}
}

/**
 * Expansion frame with custom arguments
 * @ingroup Parser
 */
// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps
class PPCustomFrame_Hash extends PPFrame_Hash {

	public $args;

	public function __construct( $preprocessor, $args ) {
		parent::__construct( $preprocessor );
		$this->args = $args;
	}

	public function __toString() {
		$s = 'cstmframe{';
$first = true;
		foreach ( $this->args as $name => $value ) {
			if ( $first ) {
				$first = false;
			} else {
				$s .= ', ';
			}
			$s .= "\"$name\":\"" .
				str_replace( '"', '\\"', $value->__toString() ) . '"';
		}
		$s .= '}';
		return $s;
	}

	/**
	 * @return bool
	 */
	public function isEmpty() {
		return !count( $this->args );
	}

	/**
	 * @param int|string $index
	 * @return string|bool
	 */
	public function getArgument( $index ) {
		if ( !isset( $this->args[$index] ) ) {
			return false;
		}
		return $this->args[$index];
	}

	public function getArguments() {
		return $this->args;
	}
}

/**
 * Proxy object for one tree node of the hash-based preprocessor output.
 *
 * The node data lives in a plain "store" array of descriptors; this object
 * only remembers the store and the index of its own descriptor in it.
 *
 * @ingroup Parser
 */
// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps
class PPNode_Hash_Tree implements PPNode {

	public $name;

	/**
	 * The store array for children of this node. It is "raw" in the sense that
	 * nodes are two-element arrays ("descriptors") rather than PPNode_Hash_*
	 * objects.
	 */
	private $rawChildren;

	/**
	 * The store array for the siblings of this node, including this node itself.
	 */
	private $store;

	/**
	 * The index into $this->store which contains the descriptor of this node.
	 */
	private $index;

	/**
	 * The offset of the name within descriptors, used in some places for
	 * readability.
	 */
	const NAME = 0;

	/**
	 * The offset of the child list within descriptors, used in some places for
	 * readability.
	 */
	const CHILDREN = 1;

	/**
	 * Construct an object using the data from $store[$index]. The rest of the
	 * store array can be accessed via getNextSibling().
	 *
	 * @param array $store
	 * @param int $index
	 */
	public function __construct( array $store, $index ) {
		$this->store = $store;
		$this->index = $index;
		list( $this->name, $this->rawChildren ) = $this->store[$index];
	}

	/**
	 * Construct an appropriate PPNode_Hash_* object with a class that depends
	 * on what is at the relevant store index.
	 *
	 * String descriptors become text nodes, array descriptors whose name
	 * starts with '@' become attribute nodes, other arrays become tree nodes.
	 *
	 * @param array $store
	 * @param int $index
	 * @throws MWException If the descriptor is neither a string nor an array
	 * @return PPNode_Hash_Tree|PPNode_Hash_Attr|PPNode_Hash_Text|bool False if
	 *   there is nothing at $store[$index]
	 */
	public static function factory( array $store, $index ) {
		if ( !isset( $store[$index] ) ) {
			return false;
		}

		$descriptor = $store[$index];
		if ( is_string( $descriptor ) ) {
			$class = PPNode_Hash_Text::class;
		} elseif ( is_array( $descriptor ) ) {
			// isset() keeps an empty name from triggering a string-offset notice;
			// such a descriptor is treated as an ordinary tree node, as before.
			if ( isset( $descriptor[self::NAME][0] ) && $descriptor[self::NAME][0] === '@' ) {
				$class = PPNode_Hash_Attr::class;
			} else {
				$class = self::class;
			}
		} else {
			throw new MWException( __METHOD__.': invalid node descriptor' );
		}
		return new $class( $store, $index );
	}

	/**
	 * Convert a node to XML, for debugging
	 *
	 * @return string
	 */
	public function __toString() {
		$inner = '';
		$attribs = '';
		// getNextSibling() returns false past the last child, which ends the loop.
		for ( $node = $this->getFirstChild(); $node; $node = $node->getNextSibling() ) {
			if ( $node instanceof PPNode_Hash_Attr ) {
				$attribs .= ' ' . $node->name . '="' . htmlspecialchars( $node->value ) . '"';
			} else {
				$inner .= $node->__toString();
			}
		}
		if ( $inner === '' ) {
			return "<{$this->name}$attribs/>";
		} else {
			return "<{$this->name}$attribs>$inner</{$this->name}>";
		}
	}

	/**
	 * Get all children as a node list of freshly built proxy objects.
	 *
	 * @return PPNode_Hash_Array
	 */
	public function getChildren() {
		$children = [];
		foreach ( array_keys( $this->rawChildren ) as $i ) {
			$children[] = self::factory( $this->rawChildren, $i );
		}
		return new PPNode_Hash_Array( $children );
	}

	/**
	 * Get the first child, or false if there is none. Note that this will
	 * return a temporary proxy object: different instances will be returned
	 * if this is called more than once on the same node.
	 *
	 * @return PPNode_Hash_Tree|PPNode_Hash_Attr|PPNode_Hash_Text|bool
	 */
	public function getFirstChild() {
		if ( !isset( $this->rawChildren[0] ) ) {
			return false;
		} else {
			return self::factory( $this->rawChildren, 0 );
		}
	}

	/**
	 * Get the next sibling, or false if there is none. Note that this will
	 * return a temporary proxy object: different instances will be returned
	 * if this is called more than once on the same node.
	 *
	 * @return PPNode_Hash_Tree|PPNode_Hash_Attr|PPNode_Hash_Text|bool
	 */
	public function getNextSibling() {
		return self::factory( $this->store, $this->index + 1 );
	}

	/**
	 * Get an array of the children with a given node name
	 *
	 * @param string $name
	 * @return PPNode_Hash_Array
	 */
	public function getChildrenOfType( $name ) {
		$children = [];
		foreach ( $this->rawChildren as $i => $child ) {
			if ( is_array( $child ) && $child[self::NAME] === $name ) {
				$children[] = self::factory( $this->rawChildren, $i );
			}
		}
		return new PPNode_Hash_Array( $children );
	}

	/**
	 * Get the raw child array. For internal use.
	 * @return array
	 */
	public function getRawChildren() {
		return $this->rawChildren;
	}

	/**
	 * @return bool Always false in this implementation.
	 */
	public function getLength() {
		return false;
	}

	/**
	 * @param int $i
	 * @return bool Always false in this implementation.
	 */
	public function item( $i ) {
		return false;
	}

	/**
	 * @return string
	 */
	public function getName() {
		return $this->name;
	}

	/**
	 * Split a "<part>" node into an associative array containing:
	 *  - name          PPNode name
	 *  - index         String index
	 *  - value         PPNode value
	 *
	 * @throws MWException
	 * @return array
	 */
	public function splitArg() {
		return self::splitRawArg( $this->rawChildren );
	}

	/**
	 * Like splitArg() but for a raw child array. For internal use only.
	 *
	 * @param array $children
	 * @throws MWException If there is no "name" child
	 * @return array
	 */
	public static function splitRawArg( array $children ) {
		$bits = [];
		foreach ( $children as $i => $child ) {
			if ( !is_array( $child ) ) {
				continue;
			}
			if ( $child[self::NAME] === 'name' ) {
				$bits['name'] = new self( $children, $i );
				if ( isset( $child[self::CHILDREN][0][self::NAME] )
					&& $child[self::CHILDREN][0][self::NAME] === '@index'
				) {
					$bits['index'] = $child[self::CHILDREN][0][self::CHILDREN][0];
				}
			} elseif ( $child[self::NAME] === 'value' ) {
				$bits['value'] = new self( $children, $i );
			}
		}

		if ( !isset( $bits['name'] ) ) {
			throw new MWException( 'Invalid brace node passed to ' . __METHOD__ );
		}
		if ( !isset( $bits['index'] ) ) {
			$bits['index'] = '';
		}
		return $bits;
	}

	/**
	 * Split an "<ext>" node into an associative array containing name, attr, inner and close
	 * All values in the resulting array are PPNodes. Inner and close are optional.
	 *
	 * @throws MWException
	 * @return array
	 */
	public function splitExt() {
		return self::splitRawExt( $this->rawChildren );
	}

	/**
	 * Like splitExt() but for a raw child array. For internal use only.
	 *
	 * @param array $children
	 * @throws MWException If there is no "name" child
	 * @return array
	 */
	public static function splitRawExt( array $children ) {
		$bits = [];
		foreach ( $children as $i => $child ) {
			if ( !is_array( $child ) ) {
				continue;
			}
			switch ( $child[self::NAME] ) {
				case 'name':
					$bits['name'] = new self( $children, $i );
					break;
				case 'attr':
					$bits['attr'] = new self( $children, $i );
					break;
				case 'inner':
					$bits['inner'] = new self( $children, $i );
					break;
				case 'close':
					$bits['close'] = new self( $children, $i );
					break;
			}
		}
		if ( !isset( $bits['name'] ) ) {
			throw new MWException( 'Invalid ext node passed to ' . __METHOD__ );
		}
		return $bits;
	}

	/**
	 * Split an "<h>" node
	 *
	 * @throws MWException If this node is not named "h"
	 * @return array
	 */
	public function splitHeading() {
		if ( $this->name !== 'h' ) {
			throw new MWException( 'Invalid h node passed to ' . __METHOD__ );
		}
		return self::splitRawHeading( $this->rawChildren );
	}

	/**
	 * Like splitHeading() but for a raw child array. For internal use only.
	 *
	 * @param array $children
	 * @throws MWException If there is no "@i" child
	 * @return array With key "i", and "level" when an "@level" child exists
	 */
	public static function splitRawHeading( array $children ) {
		$bits = [];
		foreach ( $children as $child ) {
			if ( !is_array( $child ) ) {
				continue;
			}
			if ( $child[self::NAME] === '@i' ) {
				$bits['i'] = $child[self::CHILDREN][0];
			} elseif ( $child[self::NAME] === '@level' ) {
				$bits['level'] = $child[self::CHILDREN][0];
			}
		}
		if ( !isset( $bits['i'] ) ) {
			throw new MWException( 'Invalid h node passed to ' . __METHOD__ );
		}
		return $bits;
	}

	/**
	 * Split a "<template>" or "<tplarg>" node
	 *
	 * @throws MWException
	 * @return array
	 */
	public function splitTemplate() {
		return self::splitRawTemplate( $this->rawChildren );
	}

	/**
	 * Like splitTemplate() but for a raw child array. For internal use only.
	 *
	 * @param array $children
	 * @throws MWException If there is no "title" child
	 * @return array With keys "title", "parts" (a PPNode_Hash_Array) and
	 *   "lineStart" ('1' when an "@lineStart" child exists, '' otherwise)
	 */
	public static function splitRawTemplate( array $children ) {
		$parts = [];
		$bits = [ 'lineStart' => '' ];
		foreach ( $children as $i => $child ) {
			if ( !is_array( $child ) ) {
				continue;
			}
			switch ( $child[self::NAME] ) {
				case 'title':
					$bits['title'] = new self( $children, $i );
					break;
				case 'part':
					$parts[] = new self( $children, $i );
					break;
				case '@lineStart':
					$bits['lineStart'] = '1';
					break;
			}
		}
		if ( !isset( $bits['title'] ) ) {
			throw new MWException( 'Invalid node passed to ' . __METHOD__ );
		}
		$bits['parts'] = new PPNode_Hash_Array( $parts );
		return $bits;
	}
}

/**
 * @ingroup Parser
 */
// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps
class PPNode_Hash_Text implements PPNode {

	public $value;
	private $store, $index;

	/**
	 * Construct an object using the data from $store[$index]. The rest of the
	 * store array can be accessed via getNextSibling().
	 *
	 * @param array $store
	 * @param int $index
	 */
	public function __construct( array $store, $index ) {
		$this->value = $store[$index];
		if ( !is_scalar( $this->value ) ) {
			throw new MWException( __CLASS__ . ' given object instead of string' );
		}
		$this->store = $store;
		$this->index = $index;
	}

	public function __toString() {
		return htmlspecialchars( $this->value );
	}

	public function getNextSibling() {
		return PPNode_Hash_Tree::factory( $this->store, $this->index + 1 );
	}

	public function getChildren() {
		return false;
	}

	public function getFirstChild() {
		return false;
	}

	public function getChildrenOfType( $name ) {
		return false;
	}

	public function getLength() {
		return false;
	}

	public function item( $i ) {
		return false;
	}

	public function getName() {
		return '#text';
	}

	public function splitArg() {
		throw new MWException( __METHOD__ . ': not supported' );
	}

	public function splitExt() {
		throw new MWException( __METHOD__ . ': not supported' );
	}

	public function splitHeading() {
		throw new MWException( __METHOD__ .
': not supported' );
	}
}

/**
 * Node-list wrapper: holds an array of nodes and exposes it through the
 * PPNode interface (getLength() / item()).
 *
 * @ingroup Parser
 */
// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps
class PPNode_Hash_Array implements PPNode {

	public $value;

	/**
	 * @param array $value The wrapped list of nodes
	 */
	public function __construct( $value ) {
		$this->value = $value;
	}

	/**
	 * Debug dump of the wrapper, produced by var_export().
	 *
	 * @return string
	 */
	public function __toString() {
		return var_export( $this, true );
	}

	/**
	 * @return int Number of wrapped nodes
	 */
	public function getLength() {
		return count( $this->value );
	}

	/**
	 * Get the node stored at position $i.
	 *
	 * @param int $i
	 * @return mixed The stored node, or null when nothing is stored at $i
	 */
	public function item( $i ) {
		// Guarded lookup: an out-of-range index yields null without raising
		// an undefined-offset notice.
		return isset( $this->value[$i] ) ? $this->value[$i] : null;
	}

	/**
	 * @return string
	 */
	public function getName() {
		return '#nodelist';
	}

	/**
	 * @return bool Always false in this implementation.
	 */
	public function getNextSibling() {
		return false;
	}

	/**
	 * @return bool Always false in this implementation.
	 */
	public function getChildren() {
		return false;
	}

	/**
	 * @return bool Always false in this implementation.
	 */
	public function getFirstChild() {
		return false;
	}

	/**
	 * @param string $name
	 * @return bool Always false in this implementation.
	 */
	public function getChildrenOfType( $name ) {
		return false;
	}

	/**
	 * @throws MWException Always
	 */
	public function splitArg() {
		throw new MWException( __METHOD__ . ': not supported' );
	}

	/**
	 * @throws MWException Always
	 */
	public function splitExt() {
		throw new MWException( __METHOD__ . ': not supported' );
	}

	/**
	 * @throws MWException Always
	 */
	public function splitHeading() {
		throw new MWException( __METHOD__ . ': not supported' );
	}
}

/**
 * @ingroup Parser
 */
// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps
class PPNode_Hash_Attr implements PPNode {

	public $name, $value;
	private $store, $index;

	/**
	 * Construct an object using the data from $store[$index]. The rest of the
	 * store array can be accessed via getNextSibling().
	 *
	 * @param array $store
	 * @param int $index
	 */
	public function __construct( array $store, $index ) {
		$descriptor = $store[$index];
		if ( $descriptor[PPNode_Hash_Tree::NAME][0] !== '@' ) {
			throw new MWException( __METHOD__.': invalid name in attribute descriptor' );
		}
		$this->name = substr( $descriptor[PPNode_Hash_Tree::NAME], 1 );
		$this->value = $descriptor[PPNode_Hash_Tree::CHILDREN][0];
		$this->store = $store;
		$this->index = $index;
	}

	public function __toString() {
		return "<@{$this->name}>" . htmlspecialchars( $this->value ) .
"</@{$this->name}>"; + } + + public function getName() { + return $this->name; + } + + public function getNextSibling() { + return PPNode_Hash_Tree::factory( $this->store, $this->index + 1 ); + } + + public function getChildren() { + return false; + } + + public function getFirstChild() { + return false; + } + + public function getChildrenOfType( $name ) { + return false; + } + + public function getLength() { + return false; + } + + public function item( $i ) { + return false; + } + + public function splitArg() { + throw new MWException( __METHOD__ . ': not supported' ); + } + + public function splitExt() { + throw new MWException( __METHOD__ . ': not supported' ); + } + + public function splitHeading() { + throw new MWException( __METHOD__ . ': not supported' ); + } +} diff --git a/www/wiki/includes/parser/RemexStripTagHandler.php b/www/wiki/includes/parser/RemexStripTagHandler.php new file mode 100644 index 00000000..2839147d --- /dev/null +++ b/www/wiki/includes/parser/RemexStripTagHandler.php @@ -0,0 +1,40 @@ +<?php + +use RemexHtml\Tokenizer\Attributes; +use RemexHtml\Tokenizer\TokenHandler; +use RemexHtml\Tokenizer\Tokenizer; + +/** + * @internal + */ +class RemexStripTagHandler implements TokenHandler { + private $text = ''; + public function getResult() { + return $this->text; + } + + function startDocument( Tokenizer $t, $fns, $fn ) { + // Do nothing. + } + function endDocument( $pos ) { + // Do nothing. + } + function error( $text, $pos ) { + // Do nothing. + } + function characters( $text, $start, $length, $sourceStart, $sourceLength ) { + $this->text .= substr( $text, $start, $length ); + } + function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) { + // Do nothing. + } + function endTag( $name, $sourceStart, $sourceLength ) { + // Do nothing. + } + function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) { + // Do nothing. 
+ } + function comment( $text, $sourceStart, $sourceLength ) { + // Do nothing. + } +} diff --git a/www/wiki/includes/parser/Sanitizer.php b/www/wiki/includes/parser/Sanitizer.php new file mode 100644 index 00000000..4d8df231 --- /dev/null +++ b/www/wiki/includes/parser/Sanitizer.php @@ -0,0 +1,2123 @@ +<?php +/** + * HTML sanitizer for %MediaWiki. + * + * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ + +/** + * HTML sanitizer for MediaWiki + * @ingroup Parser + */ +class Sanitizer { + /** + * Regular expression to match various types of character references in + * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences + */ + const CHAR_REFS_REGEX = + '/&([A-Za-z0-9\x80-\xff]+); + |&\#([0-9]+); + |&\#[xX]([0-9A-Fa-f]+); + |(&)/x'; + + /** + * Acceptable tag name charset from HTML5 parsing spec + * https://www.w3.org/TR/html5/syntax.html#tag-open-state + */ + const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!'; + + /** + * Blacklist for evil uris like javascript: + * WARNING: DO NOT use this in any place that actually requires blacklisting + * for security reasons. 
There are NUMEROUS[1] ways to bypass blacklisting, the + * only way to be secure from javascript: uri based xss vectors is to whitelist + * things that you know are safe and deny everything else. + * [1]: http://ha.ckers.org/xss.html + */ + const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; + const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; + + /** + * Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding. + * + * @since 1.30 + */ + const ID_PRIMARY = 0; + + /** + * Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false + * if no fallback is configured. + * + * @since 1.30 + */ + const ID_FALLBACK = 1; + + /** + * List of all named character entities defined in HTML 4.01 + * https://www.w3.org/TR/html4/sgml/entities.html + * As well as ' which is only defined starting in XHTML1. + */ + private static $htmlEntities = [ + 'Aacute' => 193, + 'aacute' => 225, + 'Acirc' => 194, + 'acirc' => 226, + 'acute' => 180, + 'AElig' => 198, + 'aelig' => 230, + 'Agrave' => 192, + 'agrave' => 224, + 'alefsym' => 8501, + 'Alpha' => 913, + 'alpha' => 945, + 'amp' => 38, + 'and' => 8743, + 'ang' => 8736, + 'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE. 
+ 'Aring' => 197, + 'aring' => 229, + 'asymp' => 8776, + 'Atilde' => 195, + 'atilde' => 227, + 'Auml' => 196, + 'auml' => 228, + 'bdquo' => 8222, + 'Beta' => 914, + 'beta' => 946, + 'brvbar' => 166, + 'bull' => 8226, + 'cap' => 8745, + 'Ccedil' => 199, + 'ccedil' => 231, + 'cedil' => 184, + 'cent' => 162, + 'Chi' => 935, + 'chi' => 967, + 'circ' => 710, + 'clubs' => 9827, + 'cong' => 8773, + 'copy' => 169, + 'crarr' => 8629, + 'cup' => 8746, + 'curren' => 164, + 'dagger' => 8224, + 'Dagger' => 8225, + 'darr' => 8595, + 'dArr' => 8659, + 'deg' => 176, + 'Delta' => 916, + 'delta' => 948, + 'diams' => 9830, + 'divide' => 247, + 'Eacute' => 201, + 'eacute' => 233, + 'Ecirc' => 202, + 'ecirc' => 234, + 'Egrave' => 200, + 'egrave' => 232, + 'empty' => 8709, + 'emsp' => 8195, + 'ensp' => 8194, + 'Epsilon' => 917, + 'epsilon' => 949, + 'equiv' => 8801, + 'Eta' => 919, + 'eta' => 951, + 'ETH' => 208, + 'eth' => 240, + 'Euml' => 203, + 'euml' => 235, + 'euro' => 8364, + 'exist' => 8707, + 'fnof' => 402, + 'forall' => 8704, + 'frac12' => 189, + 'frac14' => 188, + 'frac34' => 190, + 'frasl' => 8260, + 'Gamma' => 915, + 'gamma' => 947, + 'ge' => 8805, + 'gt' => 62, + 'harr' => 8596, + 'hArr' => 8660, + 'hearts' => 9829, + 'hellip' => 8230, + 'Iacute' => 205, + 'iacute' => 237, + 'Icirc' => 206, + 'icirc' => 238, + 'iexcl' => 161, + 'Igrave' => 204, + 'igrave' => 236, + 'image' => 8465, + 'infin' => 8734, + 'int' => 8747, + 'Iota' => 921, + 'iota' => 953, + 'iquest' => 191, + 'isin' => 8712, + 'Iuml' => 207, + 'iuml' => 239, + 'Kappa' => 922, + 'kappa' => 954, + 'Lambda' => 923, + 'lambda' => 955, + 'lang' => 9001, + 'laquo' => 171, + 'larr' => 8592, + 'lArr' => 8656, + 'lceil' => 8968, + 'ldquo' => 8220, + 'le' => 8804, + 'lfloor' => 8970, + 'lowast' => 8727, + 'loz' => 9674, + 'lrm' => 8206, + 'lsaquo' => 8249, + 'lsquo' => 8216, + 'lt' => 60, + 'macr' => 175, + 'mdash' => 8212, + 'micro' => 181, + 'middot' => 183, + 'minus' => 8722, + 'Mu' => 924, + 'mu' => 956, + 'nabla' => 
8711, + 'nbsp' => 160, + 'ndash' => 8211, + 'ne' => 8800, + 'ni' => 8715, + 'not' => 172, + 'notin' => 8713, + 'nsub' => 8836, + 'Ntilde' => 209, + 'ntilde' => 241, + 'Nu' => 925, + 'nu' => 957, + 'Oacute' => 211, + 'oacute' => 243, + 'Ocirc' => 212, + 'ocirc' => 244, + 'OElig' => 338, + 'oelig' => 339, + 'Ograve' => 210, + 'ograve' => 242, + 'oline' => 8254, + 'Omega' => 937, + 'omega' => 969, + 'Omicron' => 927, + 'omicron' => 959, + 'oplus' => 8853, + 'or' => 8744, + 'ordf' => 170, + 'ordm' => 186, + 'Oslash' => 216, + 'oslash' => 248, + 'Otilde' => 213, + 'otilde' => 245, + 'otimes' => 8855, + 'Ouml' => 214, + 'ouml' => 246, + 'para' => 182, + 'part' => 8706, + 'permil' => 8240, + 'perp' => 8869, + 'Phi' => 934, + 'phi' => 966, + 'Pi' => 928, + 'pi' => 960, + 'piv' => 982, + 'plusmn' => 177, + 'pound' => 163, + 'prime' => 8242, + 'Prime' => 8243, + 'prod' => 8719, + 'prop' => 8733, + 'Psi' => 936, + 'psi' => 968, + 'quot' => 34, + 'radic' => 8730, + 'rang' => 9002, + 'raquo' => 187, + 'rarr' => 8594, + 'rArr' => 8658, + 'rceil' => 8969, + 'rdquo' => 8221, + 'real' => 8476, + 'reg' => 174, + 'rfloor' => 8971, + 'Rho' => 929, + 'rho' => 961, + 'rlm' => 8207, + 'rsaquo' => 8250, + 'rsquo' => 8217, + 'sbquo' => 8218, + 'Scaron' => 352, + 'scaron' => 353, + 'sdot' => 8901, + 'sect' => 167, + 'shy' => 173, + 'Sigma' => 931, + 'sigma' => 963, + 'sigmaf' => 962, + 'sim' => 8764, + 'spades' => 9824, + 'sub' => 8834, + 'sube' => 8838, + 'sum' => 8721, + 'sup' => 8835, + 'sup1' => 185, + 'sup2' => 178, + 'sup3' => 179, + 'supe' => 8839, + 'szlig' => 223, + 'Tau' => 932, + 'tau' => 964, + 'there4' => 8756, + 'Theta' => 920, + 'theta' => 952, + 'thetasym' => 977, + 'thinsp' => 8201, + 'THORN' => 222, + 'thorn' => 254, + 'tilde' => 732, + 'times' => 215, + 'trade' => 8482, + 'Uacute' => 218, + 'uacute' => 250, + 'uarr' => 8593, + 'uArr' => 8657, + 'Ucirc' => 219, + 'ucirc' => 251, + 'Ugrave' => 217, + 'ugrave' => 249, + 'uml' => 168, + 'upsih' => 978, + 'Upsilon' => 933, + 
'upsilon' => 965,
		'Uuml' => 220,
		'uuml' => 252,
		'weierp' => 8472,
		'Xi' => 926,
		'xi' => 958,
		'Yacute' => 221,
		'yacute' => 253,
		'yen' => 165,
		'Yuml' => 376,
		'yuml' => 255,
		'Zeta' => 918,
		'zeta' => 950,
		'zwj' => 8205,
		'zwnj' => 8204
	];

	/**
	 * Character entity aliases accepted by MediaWiki
	 */
	private static $htmlEntityAliases = [
		'רלמ' => 'rlm',
		'رلم' => 'rlm',
	];

	/**
	 * Lazy-initialised attributes regex, see getAttribsRegex()
	 */
	private static $attribsRegex;

	/**
	 * Regular expression to match HTML/XML attribute pairs within a tag.
	 * Allows some... latitude. Based on,
	 * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
	 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
	 *
	 * The pattern is built once and kept in self::$attribsRegex.
	 *
	 * @return string
	 */
	static function getAttribsRegex() {
		if ( self::$attribsRegex === null ) {
			$attribFirst = "[:_\p{L}\p{N}]";
			$attrib = "[:_\.\-\p{L}\p{N}]";
			$space = '[\x09\x0a\x0c\x0d\x20]';
			// The pattern uses the "x" modifier, so the line breaks below matter:
			// the "#" comment inside it runs to the end of its own line only.
			self::$attribsRegex =
				"/(?:^|$space)({$attribFirst}{$attrib}*)
				  ($space*=$space*
					(?:
					 # The attribute value: quoted or alone
					  \"([^\"]*)(?:\"|\$)
					 | '([^']*)(?:'|\$)
					 | (((?!$space|>).)*)
					)
				)?(?=$space|\$)/sxu";
		}
		return self::$attribsRegex;
	}

	/**
	 * Return the various lists of recognized tags
	 *
	 * The base lists are built once, converted to hashtables (tag name as key)
	 * and kept in static variables. They are rebuilt only when the value of
	 * $wgAllowImageTag differs from the one they were built for.
	 *
	 * @param array $extratags For any extra tags to include
	 * @param array $removetags For any tags (default or extra) to exclude
	 * @return array Map with the keys htmlpairs, htmlsingle, htmlsingleonly,
	 *   htmlnest, tabletags, htmllist, listtags, htmlsingleallowed and
	 *   htmlelements, each a hashtable keyed by tag name
	 */
	public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
		global $wgAllowImageTag;

		static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
			$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised,
			$staticContext;

		// "Already built" and "built for which configuration" are tracked
		// separately. Storing the configuration value itself as the initialised
		// flag made a false $wgAllowImageTag look like "never built", so the
		// tables were rebuilt on every call. A changed global (as can happen
		// in tests) still triggers a rebuild.
		$globalContext = $wgAllowImageTag;
		if ( !$staticInitialised || $staticContext !== $globalContext ) {
			$htmlpairsStatic = [ # Tags that must be closed
				'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
				'strike', 'strong', 'tt', 'var', 'div', 'center',
				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
				'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
				'kbd', 'samp', 'data', 'time', 'mark'
			];
			$htmlsingle = [
				'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
			];

			# Elements that cannot have close tags. This is (not coincidentally)
			# also the list of tags for which the HTML 5 parsing algorithm
			# requires you to "acknowledge the token's self-closing flag", i.e.
			# a self-closing tag like <br/> is not an HTML 5 parse error only
			# for this list.
			$htmlsingleonly = [
				'br', 'wbr', 'hr', 'meta', 'link'
			];

			$htmlnest = [ # Tags that can be nested--??
				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
				'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
				'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
			];
			$tabletags = [ # Can only appear inside table, we will close them
				'td', 'th', 'tr',
			];
			$htmllist = [ # Tags used by list
				'ul', 'ol',
			];
			$listtags = [ # Tags that can appear in a list
				'li',
			];

			if ( $wgAllowImageTag ) {
				$htmlsingle[] = 'img';
				$htmlsingleonly[] = 'img';
			}

			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

			# Convert them all to hashtables for faster lookup
			$vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
				'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
			foreach ( $vars as $var ) {
				// Variable variable: flips the static named by $var in place.
				$$var = array_flip( $$var );
			}
			$staticInitialised = true;
			$staticContext = $globalContext;
		}

		# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
		$extratags = array_flip( $extratags );
		$removetags = array_flip( $removetags );
		$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
		$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

		return [
			'htmlpairs' => $htmlpairs,
			'htmlsingle' => $htmlsingle,
			'htmlsingleonly' => $htmlsingleonly,
			'htmlnest' => $htmlnest,
			'tabletags' => $tabletags,
			'htmllist' => $htmllist,
			'listtags' => $listtags,
			'htmlsingleallowed' => $htmlsingleallowed,
			'htmlelements' => $htmlelements,
		];
	}

	/**
	 * Cleans up HTML, removes dangerous tags and attributes, and
	 * removes HTML comments
	 * @param string $text
	 * @param callable $processCallback Callback to do any variable or parameter
	 *   replacements in HTML attribute values
	 * @param array|bool $args Arguments for the processing callback
	 * @param array $extratags For any extra tags to include
+ * @param array $removetags For any tags (default or extra) to exclude + * @param callable $warnCallback (Deprecated) Callback allowing the + * addition of a tracking category when bad input is encountered. + * DO NOT ADD NEW PARAMETERS AFTER $warnCallback, since it will be + * removed shortly. + * @return string + */ + public static function removeHTMLtags( $text, $processCallback = null, + $args = [], $extratags = [], $removetags = [], $warnCallback = null + ) { + $tagData = self::getRecognizedTagData( $extratags, $removetags ); + $htmlpairs = $tagData['htmlpairs']; + $htmlsingle = $tagData['htmlsingle']; + $htmlsingleonly = $tagData['htmlsingleonly']; + $htmlnest = $tagData['htmlnest']; + $tabletags = $tagData['tabletags']; + $htmllist = $tagData['htmllist']; + $listtags = $tagData['listtags']; + $htmlsingleallowed = $tagData['htmlsingleallowed']; + $htmlelements = $tagData['htmlelements']; + + # Remove HTML comments + $text = self::removeHTMLcomments( $text ); + $bits = explode( '<', $text ); + $text = str_replace( '>', '>', array_shift( $bits ) ); + if ( !MWTidy::isEnabled() ) { + $tagstack = $tablestack = []; + foreach ( $bits as $x ) { + $regs = []; + # $slash: Does the current element start with a '/'? + # $t: Current element name + # $params: String between element name and > + # $brace: Ending '>' or '/>' + # $rest: Everything until the next element of $bits + if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) { + list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; + } else { + $slash = $t = $params = $brace = $rest = null; + } + + $badtag = false; + $t = strtolower( $t ); + if ( isset( $htmlelements[$t] ) ) { + # Check our stack + if ( $slash && isset( $htmlsingleonly[$t] ) ) { + $badtag = true; + } elseif ( $slash ) { + # Closing a tag... is it the one we just opened? 
+ Wikimedia\suppressWarnings(); + $ot = array_pop( $tagstack ); + Wikimedia\restoreWarnings(); + + if ( $ot != $t ) { + if ( isset( $htmlsingleallowed[$ot] ) ) { + # Pop all elements with an optional close tag + # and see if we find a match below them + $optstack = []; + array_push( $optstack, $ot ); + Wikimedia\suppressWarnings(); + $ot = array_pop( $tagstack ); + Wikimedia\restoreWarnings(); + while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) { + array_push( $optstack, $ot ); + Wikimedia\suppressWarnings(); + $ot = array_pop( $tagstack ); + Wikimedia\restoreWarnings(); + } + if ( $t != $ot ) { + # No match. Push the optional elements back again + $badtag = true; + Wikimedia\suppressWarnings(); + $ot = array_pop( $optstack ); + Wikimedia\restoreWarnings(); + while ( $ot ) { + array_push( $tagstack, $ot ); + Wikimedia\suppressWarnings(); + $ot = array_pop( $optstack ); + Wikimedia\restoreWarnings(); + } + } + } else { + Wikimedia\suppressWarnings(); + array_push( $tagstack, $ot ); + Wikimedia\restoreWarnings(); + + # <li> can be nested in <ul> or <ol>, skip those cases: + if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) { + $badtag = true; + } + } + } else { + if ( $t == 'table' ) { + $tagstack = array_pop( $tablestack ); + } + } + $newparams = ''; + } else { + # Keep track for later + if ( isset( $tabletags[$t] ) && !in_array( 'table', $tagstack ) ) { + $badtag = true; + } elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) { + $badtag = true; + # Is it a self closed htmlpair ? (T7487) + } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) { + // Eventually we'll just remove the self-closing + // slash, in order to be consistent with HTML5 + // semantics. + // $brace = '>'; + // For now, let's just warn authors to clean up. 
+ if ( is_callable( $warnCallback ) ) { + call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] ); + } + $badtag = true; + } elseif ( isset( $htmlsingleonly[$t] ) ) { + # Hack to force empty tag for unclosable elements + $brace = '/>'; + } elseif ( isset( $htmlsingle[$t] ) ) { + # Hack to not close $htmlsingle tags + $brace = null; + # Still need to push this optionally-closed tag to + # the tag stack so that we can match end tags + # instead of marking them as bad. + array_push( $tagstack, $t ); + } elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) { + // New table tag but forgot to close the previous one + $text .= "</$t>"; + } else { + if ( $t == 'table' ) { + array_push( $tablestack, $tagstack ); + $tagstack = []; + } + array_push( $tagstack, $t ); + } + + # Replace any variables or template parameters with + # plaintext results. + if ( is_callable( $processCallback ) ) { + call_user_func_array( $processCallback, [ &$params, $args ] ); + } + + if ( !self::validateTag( $params, $t ) ) { + $badtag = true; + } + + # Strip non-approved attributes from the tag + $newparams = self::fixTagAttributes( $params, $t ); + } + if ( !$badtag ) { + $rest = str_replace( '>', '>', $rest ); + $close = ( $brace == '/>' && !$slash ) ? ' /' : ''; + $text .= "<$slash$t$newparams$close>$rest"; + continue; + } + } + $text .= '<' . 
str_replace( '>', '>', $x ); + } + # Close off any remaining tags + while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) { + $text .= "</$t>\n"; + if ( $t == 'table' ) { + $tagstack = array_pop( $tablestack ); + } + } + } else { + # this might be possible using tidy itself + foreach ( $bits as $x ) { + if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) { + list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; + + $badtag = false; + $t = strtolower( $t ); + if ( isset( $htmlelements[$t] ) ) { + if ( is_callable( $processCallback ) ) { + call_user_func_array( $processCallback, [ &$params, $args ] ); + } + + if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) { + // Eventually we'll just remove the self-closing + // slash, in order to be consistent with HTML5 + // semantics. + // $brace = '>'; + // For now, let's just warn authors to clean up. + if ( is_callable( $warnCallback ) ) { + call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] ); + } + } + if ( !self::validateTag( $params, $t ) ) { + $badtag = true; + } + + $newparams = self::fixTagAttributes( $params, $t ); + if ( !$badtag ) { + if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) { + # Interpret self-closing tags as empty tags even when + # HTML 5 would interpret them as start tags. Such input + # is commonly seen on Wikimedia wikis with this intention. + $brace = "></$t>"; + } + + $rest = str_replace( '>', '>', $rest ); + $text .= "<$slash$t$newparams$brace$rest"; + continue; + } + } + } + $text .= '<' . str_replace( '>', '>', $x ); + } + } + return $text; + } + + /** + * Remove '<!--', '-->', and everything between. + * To avoid leaving blank lines, when a comment is both preceded + * and followed by a newline (ignoring spaces), trim leading and + * trailing spaces and one of the newlines. 
+ * + * @param string $text + * @return string + */ + public static function removeHTMLcomments( $text ) { + while ( ( $start = strpos( $text, '<!--' ) ) !== false ) { + $end = strpos( $text, '-->', $start + 4 ); + if ( $end === false ) { + # Unterminated comment; bail out + break; + } + + $end += 3; + + # Trim space and newline if the comment is both + # preceded and followed by a newline + $spaceStart = max( $start - 1, 0 ); + $spaceLen = $end - $spaceStart; + while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) { + $spaceStart--; + $spaceLen++; + } + while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) { + $spaceLen++; + } + if ( substr( $text, $spaceStart, 1 ) === "\n" + && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) { + # Remove the comment, leading and trailing + # spaces, and leave only one newline. + $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 ); + } else { + # Remove just the comment. + $text = substr_replace( $text, '', $start, $end - $start ); + } + } + return $text; + } + + /** + * Takes attribute names and values for a tag and the tag name and + * validates that the tag is allowed to be present. + * This DOES NOT validate the attributes, nor does it validate the + * tags themselves. This method only handles the special circumstances + * where we may want to allow a tag within content but ONLY when it has + * specific attributes set. 
+ * + * @param string $params + * @param string $element + * @return bool + */ + static function validateTag( $params, $element ) { + $params = self::decodeTagAttributes( $params ); + + if ( $element == 'meta' || $element == 'link' ) { + if ( !isset( $params['itemprop'] ) ) { + // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content + return false; + } + if ( $element == 'meta' && !isset( $params['content'] ) ) { + // <meta> must have a content="" for the itemprop + return false; + } + if ( $element == 'link' && !isset( $params['href'] ) ) { + // <link> must have an associated href="" + return false; + } + } + + return true; + } + + /** + * Take an array of attribute names and values and normalize or discard + * illegal values for the given element type. + * + * - Discards attributes not on a whitelist for the given element + * - Unsafe style attributes are discarded + * - Invalid id attributes are re-encoded + * + * @param array $attribs + * @param string $element + * @return array + * + * @todo Check for legal values where the DTD limits things. + * @todo Check for unique id attribute :P + */ + static function validateTagAttributes( $attribs, $element ) { + return self::validateAttributes( $attribs, + self::attributeWhitelist( $element ) ); + } + + /** + * Take an array of attribute names and values and normalize or discard + * illegal values for the given whitelist. + * + * - Discards attributes not on the given whitelist + * - Unsafe style attributes are discarded + * - Invalid id attributes are re-encoded + * + * @param array $attribs + * @param array $whitelist List of allowed attribute names + * @return array + * + * @todo Check for legal values where the DTD limits things. + * @todo Check for unique id attribute :P + */ + static function validateAttributes( $attribs, $whitelist ) { + $whitelist = array_flip( $whitelist ); + $hrefExp = '/^(' . wfUrlProtocols() . 
')[^\s]+$/'; + + $out = []; + foreach ( $attribs as $attribute => $value ) { + # Allow XML namespace declaration to allow RDFa + if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) { + if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) { + $out[$attribute] = $value; + } + + continue; + } + + # Allow any attribute beginning with "data-" + # However: + # * Disallow data attributes used by MediaWiki code + # * Ensure that the attribute is not namespaced by banning + # colons. + if ( !preg_match( '/^data-[^:]*$/i', $attribute ) + && !isset( $whitelist[$attribute] ) + || self::isReservedDataAttribute( $attribute ) + ) { + continue; + } + + # Strip javascript "expression" from stylesheets. + # https://msdn.microsoft.com/en-us/library/ms537634.aspx + if ( $attribute == 'style' ) { + $value = self::checkCss( $value ); + } + + # Escape HTML id attributes + if ( $attribute === 'id' ) { + $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY ); + } + + # Escape HTML id reference lists + if ( $attribute === 'aria-describedby' + || $attribute === 'aria-flowto' + || $attribute === 'aria-labelledby' + || $attribute === 'aria-owns' + ) { + $value = self::escapeIdReferenceList( $value ); + } + + // RDFa and microdata properties allow URLs, URIs and/or CURIs. + // Check them for sanity. + if ( $attribute === 'rel' || $attribute === 'rev' + # RDFa + || $attribute === 'about' || $attribute === 'property' + || $attribute === 'resource' || $attribute === 'datatype' + || $attribute === 'typeof' + # HTML5 microdata + || $attribute === 'itemid' || $attribute === 'itemprop' + || $attribute === 'itemref' || $attribute === 'itemscope' + || $attribute === 'itemtype' + ) { + // Paranoia. 
Allow "simple" values but suppress javascript + if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) { + continue; + } + } + + # NOTE: even though elements using href/src are not allowed directly, supply + # validation code that can be used by tag hook handlers, etc + if ( $attribute === 'href' || $attribute === 'src' || $attribute === 'poster' ) { + if ( !preg_match( $hrefExp, $value ) ) { + continue; // drop any href or src attributes not using an allowed protocol. + // NOTE: this also drops all relative URLs + } + } + + // If this attribute was previously set, override it. + // Output should only have one attribute of each name. + $out[$attribute] = $value; + } + + # itemtype, itemid, itemref don't make sense without itemscope + if ( !array_key_exists( 'itemscope', $out ) ) { + unset( $out['itemtype'] ); + unset( $out['itemid'] ); + unset( $out['itemref'] ); + } + # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref. + + return $out; + } + + /** + * Given an attribute name, checks whether it is a reserved data attribute + * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki + * core and extension code can safely use it to communicate with frontend code. + * @param string $attr Attribute name. + * @return bool + */ + public static function isReservedDataAttribute( $attr ) { + // data-ooui is reserved for ooui. + // data-mw and data-parsoid are reserved for parsoid. + // data-mw-<name here> is reserved for extensions (or core) if + // they need to communicate some data to the client and want to be + // sure that it isn't coming from an untrusted user. + // We ignore the possibility of namespaces since user-generated HTML + // can't use them anymore. + return (bool)preg_match( '/^data-(ooui|mw|parsoid)/i', $attr ); + } + + /** + * Merge two sets of HTML attributes. 
Conflicting items in the second set + * will override those in the first, except for 'class' attributes which + * will be combined (if they're both strings). + * + * @todo implement merging for other attributes such as style + * @param array $a + * @param array $b + * @return array + */ + static function mergeAttributes( $a, $b ) { + $out = array_merge( $a, $b ); + if ( isset( $a['class'] ) && isset( $b['class'] ) + && is_string( $a['class'] ) && is_string( $b['class'] ) + && $a['class'] !== $b['class'] + ) { + $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}", + -1, PREG_SPLIT_NO_EMPTY ); + $out['class'] = implode( ' ', array_unique( $classes ) ); + } + return $out; + } + + /** + * Normalize CSS into a format we can easily search for hostile input + * - decode character references + * - decode escape sequences + * - convert characters that IE6 interprets into ascii + * - remove comments, unless the entire value is one single comment + * @param string $value the css string + * @return string normalized css + */ + public static function normalizeCss( $value ) { + // Decode character references like { + $value = self::decodeCharReferences( $value ); + + // Decode escape sequences and line continuation + // See the grammar in the CSS 2 spec, appendix D. + // This has to be done AFTER decoding character references. + // This means it isn't possible for this function to return + // unsanitized escape sequences. It is possible to manufacture + // input that contains character references that decode to + // escape sequences that decode to character references, but + // it's OK for the return value to contain character references + // because the caller is supposed to escape those anyway. + static $decodeRegex; + if ( !$decodeRegex ) { + $space = '[\\x20\\t\\r\\n\\f]'; + $nl = '(?:\\n|\\r\\n|\\r|\\f)'; + $backslash = '\\\\'; + $decodeRegex = "/ $backslash + (?: + ($nl) | # 1. Line continuation + ([0-9A-Fa-f]{1,6})$space? | # 2. character number + (.) | # 3. 
backslash cancelling special meaning + () | # 4. backslash at end of string + )/xu"; + } + $value = preg_replace_callback( $decodeRegex, + [ __CLASS__, 'cssDecodeCallback' ], $value ); + + // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii + $value = preg_replace_callback( + '/[!-[]-z]/u', // U+FF01 to U+FF5A, excluding U+FF3C (T60088) + function ( $matches ) { + $cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] ); + if ( $cp === false ) { + return ''; + } + return chr( $cp - 65248 ); // ASCII range \x21-\x7A + }, + $value + ); + + // Convert more characters IE6 might treat as ascii + // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D + $value = str_replace( + [ 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ], + [ 'r', 'n', 'n', 'l', 'i', '(', '(' ], + $value + ); + + // Let the value through if it's nothing but a single comment, to + // allow other functions which may reject it to pass some error + // message through. + if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) { + // Remove any comments; IE gets token splitting wrong + // This must be done AFTER decoding character references and + // escape sequences, because those steps can introduce comments + // This step cannot introduce character references or escape + // sequences, because it replaces comments with spaces rather + // than removing them completely. + $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value ); + + // Remove anything after a comment-start token, to guard against + // incorrect client implementations. 
+ $commentPos = strpos( $value, '/*' ); + if ( $commentPos !== false ) { + $value = substr( $value, 0, $commentPos ); + } + } + + // S followed by repeat, iteration, or prolonged sound marks, + // which IE will treat as "ss" + $value = preg_replace( + '/s(?: + \xE3\x80\xB1 | # U+3031 + \xE3\x82\x9D | # U+309D + \xE3\x83\xBC | # U+30FC + \xE3\x83\xBD | # U+30FD + \xEF\xB9\xBC | # U+FE7C + \xEF\xB9\xBD | # U+FE7D + \xEF\xBD\xB0 # U+FF70 + )/ix', + 'ss', + $value + ); + + return $value; + } + + /** + * Pick apart some CSS and check it for forbidden or unsafe structures. + * Returns a sanitized string. This sanitized string will have + * character references and escape sequences decoded and comments + * stripped (unless it is itself one valid comment, in which case the value + * will be passed through). If the input is just too evil, only a comment + * complaining about evilness will be returned. + * + * Currently URL references, 'expression', 'tps' are forbidden. + * + * NOTE: Despite the fact that character references are decoded, the + * returned string may contain character references given certain + * clever input strings. These character references must + * be escaped before the return value is embedded in HTML. + * + * @param string $value + * @return string + */ + static function checkCss( $value ) { + $value = self::normalizeCss( $value ); + + // Reject problematic keywords and control characters + if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) || + strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) { + return '/* invalid control char */'; + } elseif ( preg_match( + '! 
expression + | filter\s*: + | accelerator\s*: + | -o-link\s*: + | -o-link-source\s*: + | -o-replace\s*: + | image\s*\( + | image-set\s*\( + | attr\s*\([^)]+[\s,]+url + | var\s*\( + !ix', $value ) ) { + return '/* insecure input */'; + } + return $value; + } + + /** + * @param array $matches + * @return string + */ + static function cssDecodeCallback( $matches ) { + if ( $matches[1] !== '' ) { + // Line continuation + return ''; + } elseif ( $matches[2] !== '' ) { + $char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) ); + } elseif ( $matches[3] !== '' ) { + $char = $matches[3]; + } else { + $char = '\\'; + } + if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) { + // These characters need to be escaped in strings + // Clean up the escape sequence to avoid parsing errors by clients + return '\\' . dechex( ord( $char ) ) . ' '; + } else { + // Decode unnecessary escape + return $char; + } + } + + /** + * Take a tag soup fragment listing an HTML element's attributes + * and normalize it to well-formed XML, discarding unwanted attributes. + * Output is safe for further wikitext processing, with escaping of + * values that could trigger problems. + * + * - Normalizes attribute names to lowercase + * - Discards attributes not on a whitelist for the given element + * - Turns broken or invalid entities into plaintext + * - Double-quotes all attribute values + * - Attributes without values are given the name as attribute + * - Double attributes are discarded + * - Unsafe style attributes are discarded + * - Prepends space if there are attributes. + * - (Optionally) Sorts attributes by name. 
+ * + * @param string $text + * @param string $element + * @param bool $sorted Whether to sort the attributes (default: false) + * @return string + */ + static function fixTagAttributes( $text, $element, $sorted = false ) { + if ( trim( $text ) == '' ) { + return ''; + } + + $decoded = self::decodeTagAttributes( $text ); + $stripped = self::validateTagAttributes( $decoded, $element ); + + if ( $sorted ) { + ksort( $stripped ); + } + + return self::safeEncodeTagAttributes( $stripped ); + } + + /** + * Encode an attribute value for HTML output. + * @param string $text + * @return string HTML-encoded text fragment + */ + static function encodeAttribute( $text ) { + $encValue = htmlspecialchars( $text, ENT_QUOTES ); + + // Whitespace is normalized during attribute decoding, + // so if we've been passed non-spaces we must encode them + // ahead of time or they won't be preserved. + $encValue = strtr( $encValue, [ + "\n" => ' ', + "\r" => ' ', + "\t" => '	', + ] ); + + return $encValue; + } + + /** + * Encode an attribute value for HTML tags, with extra armoring + * against further wiki processing. + * @param string $text + * @return string HTML-encoded text fragment + */ + static function safeEncodeAttribute( $text ) { + $encValue = self::encodeAttribute( $text ); + + # Templates and links may be expanded in later parsing, + # creating invalid or dangerous output. Suppress this. + $encValue = strtr( $encValue, [ + '<' => '<', // This should never happen, + '>' => '>', // we've received invalid input + '"' => '"', // which should have been escaped. + '{' => '{', + '}' => '}', // prevent unpaired language conversion syntax + '[' => '[', + ']' => ']', + "''" => '''', + 'ISBN' => 'ISBN', + 'RFC' => 'RFC', + 'PMID' => 'PMID', + '|' => '|', + '__' => '__', + ] ); + + # Stupid hack + $encValue = preg_replace_callback( + '/((?i)' . wfUrlProtocols() . 
')/', + function ( $matches ) { + return str_replace( ':', ':', $matches[1] ); + }, + $encValue ); + return $encValue; + } + + /** + * Given a value, escape it so that it can be used in an id attribute and + * return it. This will use HTML5 validation if $wgExperimentalHtmlIds is + * true, allowing anything but ASCII whitespace. Otherwise it will use + * HTML 4 rules, which means a narrow subset of ASCII, with bad characters + * escaped with lots of dots. + * + * To ensure we don't have to bother escaping anything, we also strip ', ", + * & even if $wgExperimentalIds is true. TODO: Is this the best tactic? + * We also strip # because it upsets IE, and % because it could be + * ambiguous if it's part of something that looks like a percent escape + * (which don't work reliably in fragments cross-browser). + * + * @deprecated since 1.30, use one of this class' escapeIdFor*() functions + * + * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters + * in the id and name attributes + * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with + * the id attribute + * @see https://www.w3.org/TR/html5/dom.html#the-id-attribute + * HTML5 definition of id attribute + * + * @param string $id Id to escape + * @param string|array $options String or array of strings (default is array()): + * 'noninitial': This is a non-initial fragment of an id, not a full id, + * so don't pay attention if the first character isn't valid at the + * beginning of an id. Only matters if $wgExperimentalHtmlIds is + * false. + * 'legacy': Behave the way the old HTML 4-based ID escaping worked even + * if $wgExperimentalHtmlIds is used, so we can generate extra + * anchors and links won't break. 
+ * @return string + */ + static function escapeId( $id, $options = [] ) { + global $wgExperimentalHtmlIds; + $options = (array)$options; + + if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) { + $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); + $id = trim( $id, '_' ); + if ( $id === '' ) { + // Must have been all whitespace to start with. + return '_'; + } else { + return $id; + } + } + + // HTML4-style escaping + static $replace = [ + '%3A' => ':', + '%' => '.' + ]; + + $id = urlencode( strtr( $id, ' ', '_' ) ); + $id = strtr( $id, $replace ); + + if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options ) ) { + // Initial character must be a letter! + $id = "x$id"; + } + return $id; + } + + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid HTML id attribute. + * + * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe, + * be sure to use proper escaping. + * + * @param string $id String to escape + * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding + * should be used. + * @return string|bool Escaped ID or false if fallback encoding is requested but it's not + * configured. + * + * @since 1.30 + */ + public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) { + global $wgFragmentMode; + + if ( !isset( $wgFragmentMode[$mode] ) ) { + if ( $mode === self::ID_PRIMARY ) { + throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' ); + } + return false; + } + + $internalMode = $wgFragmentMode[$mode]; + + return self::escapeIdInternal( $id, $internalMode ); + } + + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid URL fragment. + * + * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe, + * be sure to use proper escaping. 
+ * + * @param string $id String to escape + * @return string Escaped ID + * + * @since 1.30 + */ + public static function escapeIdForLink( $id ) { + global $wgFragmentMode; + + if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) { + throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' ); + } + + $mode = $wgFragmentMode[self::ID_PRIMARY]; + + $id = self::escapeIdInternal( $id, $mode ); + + return $id; + } + + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid URL fragment for external interwikis. + * + * @param string $id String to escape + * @return string Escaped ID + * + * @since 1.30 + */ + public static function escapeIdForExternalInterwiki( $id ) { + global $wgExternalInterwikiFragmentMode; + + $id = self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode ); + + return $id; + } + + /** + * Helper for escapeIdFor*() functions. Performs most of the actual escaping. + * + * @param string $id String to escape + * @param string $mode One of modes from $wgFragmentMode + * @return string + */ + private static function escapeIdInternal( $id, $mode ) { + switch ( $mode ) { + case 'html5': + $id = str_replace( ' ', '_', $id ); + break; + case 'legacy': + // This corresponds to 'noninitial' mode of the old escapeId() + static $replace = [ + '%3A' => ':', + '%' => '.' + ]; + + $id = urlencode( str_replace( ' ', '_', $id ) ); + $id = strtr( $id, $replace ); + break; + case 'html5-legacy': + $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); + $id = trim( $id, '_' ); + if ( $id === '' ) { + // Must have been all whitespace to start with. + $id = '_'; + } + break; + default: + throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ ); + } + + return $id; + } + + /** + * Given a string containing a space delimited list of ids, escape each id + * to match ids escaped by the escapeId() function. 
+ * + * @todo remove $options completely in 1.32 + * + * @since 1.27 + * + * @param string $referenceString Space delimited list of ids + * @param string|array $options Deprecated and does nothing. + * @return string + */ + static function escapeIdReferenceList( $referenceString, $options = [] ) { + if ( $options ) { + wfDeprecated( __METHOD__ . ' with $options', '1.31' ); + } + # Explode the space delimited list string into an array of tokens + $references = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY ); + + # Escape each token as an id + foreach ( $references as &$ref ) { + $ref = self::escapeIdForAttribute( $ref ); + } + + # Merge the array back to a space delimited list string + # If the array is empty, the result will be an empty string ('') + $referenceString = implode( ' ', $references ); + + return $referenceString; + } + + /** + * Given a value, escape it so that it can be used as a CSS class and + * return it. + * + * @todo For extra validity, input should be validated UTF-8. + * + * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format + * + * @param string $class + * @return string + */ + static function escapeClass( $class ) { + // Convert ugly stuff to underscores and kill underscores in ugly places + return rtrim( preg_replace( + [ '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ], + '_', + $class ), '_' ); + } + + /** + * Given HTML input, escape with htmlspecialchars but un-escape entities. + * This allows (generally harmless) entities like   to survive. + * + * @param string $html HTML to escape + * @return string Escaped input + */ + static function escapeHtmlAllowEntities( $html ) { + $html = self::decodeCharReferences( $html ); + # It seems wise to escape ' as well as ", as a matter of course. Can't + # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters + # don't cause the entire string to disappear. 
+ $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE ); + return $html; + } + + /** + * Return an associative array of attribute names and values from + * a partial tag string. Attribute names are forced to lowercase, + * character references are decoded to UTF-8 text. + * + * @param string $text + * @return array + */ + public static function decodeTagAttributes( $text ) { + if ( trim( $text ) == '' ) { + return []; + } + + $attribs = []; + $pairs = []; + if ( !preg_match_all( + self::getAttribsRegex(), + $text, + $pairs, + PREG_SET_ORDER ) ) { + return $attribs; + } + + foreach ( $pairs as $set ) { + $attribute = strtolower( $set[1] ); + $value = self::getTagAttributeCallback( $set ); + + // Normalize whitespace + $value = preg_replace( '/[\t\r\n ]+/', ' ', $value ); + $value = trim( $value ); + + // Decode character references + $attribs[$attribute] = self::decodeCharReferences( $value ); + } + return $attribs; + } + + /** + * Build a partial tag string from an associative array of attribute + * names and values as returned by decodeTagAttributes. + * + * @param array $assoc_array + * @return string + */ + public static function safeEncodeTagAttributes( $assoc_array ) { + $attribs = []; + foreach ( $assoc_array as $attribute => $value ) { + $encAttribute = htmlspecialchars( $attribute ); + $encValue = self::safeEncodeAttribute( $value ); + + $attribs[] = "$encAttribute=\"$encValue\""; + } + return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : ''; + } + + /** + * Pick the appropriate attribute value from a match set from the + * attribs regex matches. + * + * @param array $set + * @throws MWException When tag conditions are not met. + * @return string + */ + private static function getTagAttributeCallback( $set ) { + if ( isset( $set[5] ) ) { + # No quotes. 
+ return $set[5]; + } elseif ( isset( $set[4] ) ) { + # Single-quoted + return $set[4]; + } elseif ( isset( $set[3] ) ) { + # Double-quoted + return $set[3]; + } elseif ( !isset( $set[2] ) ) { + # In XHTML, attributes must have a value so return an empty string. + # See "Empty attribute syntax", + # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name + return ""; + } else { + throw new MWException( "Tag conditions not met. This should never happen and is a bug." ); + } + } + + /** + * @param string $text + * @return string + */ + private static function normalizeWhitespace( $text ) { + return preg_replace( + '/\r\n|[\x20\x0d\x0a\x09]/', + ' ', + $text ); + } + + /** + * Normalizes whitespace in a section name, such as might be returned + * by Parser::stripSectionName(), for use in the id's that are used for + * section links. + * + * @param string $section + * @return string + */ + static function normalizeSectionNameWhitespace( $section ) { + return trim( preg_replace( '/[ _]+/', ' ', $section ) ); + } + + /** + * Ensure that any entities and character references are legal + * for XML and XHTML specifically. Any stray bits will be + * &-escaped to result in a valid text fragment. + * + * a. named char refs can only be < > & ", others are + * numericized (this way we're well-formed even without a DTD) + * b. any numeric char refs must be legal chars, not invalid or forbidden + * c. use lower cased "&#x", not "&#X" + * d. 
fix or reject non-valid attributes + * + * @param string $text + * @return string + * @private + */
	static function normalizeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			[ self::class, 'normalizeCharReferencesCallback' ],
			$text
		);
	}

	/**
	 * preg_replace_callback() handler for normalizeCharReferences().
	 *
	 * Group 1 is handed to normalizeEntity(), group 2 to decCharReference()
	 * and group 3 to hexCharReference(). If none of them yields a
	 * replacement, the whole match is HTML-escaped instead.
	 *
	 * @param array $matches Match array from preg_replace_callback()
	 * @return string
	 */
	static function normalizeCharReferencesCallback( $matches ) {
		$ret = null;
		if ( $matches[1] != '' ) {
			$ret = self::normalizeEntity( $matches[1] );
		} elseif ( $matches[2] != '' ) {
			$ret = self::decCharReference( $matches[2] );
		} elseif ( $matches[3] != '' ) {
			$ret = self::hexCharReference( $matches[3] );
		}

		// Invalid references and bare ampersands are escaped verbatim.
		return $ret === null ? htmlspecialchars( $matches[0] ) : $ret;
	}

	/**
	 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
	 * return the equivalent numeric entity reference (except for the core
	 * &lt; &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias,
	 * returns the HTML equivalent. Otherwise, returns HTML-escaped text of
	 * pseudo-entity source (eg &amp;foo;)
	 *
	 * @param string $name Entity name without the leading '&' and trailing ';'
	 * @return string
	 */
	static function normalizeEntity( $name ) {
		if ( isset( self::$htmlEntityAliases[$name] ) ) {
			return '&' . self::$htmlEntityAliases[$name] . ';';
		} elseif ( in_array( $name, [ 'lt', 'gt', 'amp', 'quot' ], true ) ) {
			return "&$name;";
		} elseif ( isset( self::$htmlEntities[$name] ) ) {
			return '&#' . self::$htmlEntities[$name] . ';';
		} else {
			// Unknown name: escape the ampersand so the pseudo-entity is
			// emitted as literal text rather than as an undefined entity.
			return "&amp;$name;";
		}
	}

	/**
	 * Normalize a decimal character reference.
	 *
	 * @param string|int $codepoint Decimal digits of the reference
	 * @return null|string The reference as "&#N;", or null if the codepoint
	 *  is rejected by validateCodepoint()
	 */
	static function decCharReference( $codepoint ) {
		$point = intval( $codepoint );
		return self::validateCodepoint( $point )
			? sprintf( '&#%d;', $point )
			: null;
	}

	/**
	 * Normalize a hexadecimal character reference.
	 *
	 * @param string $codepoint Hexadecimal digits of the reference
	 * @return null|string The reference as lower-case "&#xN;", or null if
	 *  the codepoint is rejected by validateCodepoint()
	 */
	static function hexCharReference( $codepoint ) {
		$point = hexdec( $codepoint );
		return self::validateCodepoint( $point )
			? sprintf( '&#x%x;', $point )
			: null;
	}

	/**
	 * Returns true if a given Unicode codepoint is a valid character in
	 * both HTML5 and XML.
	 * @param int $codepoint
	 * @return bool
	 */
	private static function validateCodepoint( $codepoint ) {
		# U+000C is valid in HTML5 but not allowed in XML.
		# U+000D is valid in XML but not allowed in HTML5.
		# U+007F - U+009F are disallowed in HTML5 (control characters).
		return $codepoint == 0x09
			|| $codepoint == 0x0a
			|| ( $codepoint >= 0x20 && $codepoint <= 0x7e )
			|| ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
			|| ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
			|| ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
	}

	/**
	 * Decode any character references, numeric or named entities,
	 * in the text and return a UTF-8 string.
	 *
	 * @param string $text
	 * @return string
	 */
	public static function decodeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			[ self::class, 'decodeCharReferencesCallback' ],
			$text
		);
	}

+ /** + * Decode any character references, numeric or named entities, + * in the text and normalize the resulting string. (T16952) + * + * This is useful for page titles, not for text to be displayed, + * MediaWiki allows HTML entities to escape normalization as a feature. 
+ * + * @param string $text Already normalized, containing entities + * @return string Still normalized, without entities + */
	public static function decodeCharReferencesAndNormalize( $text ) {
		global $wgContLang;

		$replacements = 0;
		$decoded = preg_replace_callback(
			self::CHAR_REFS_REGEX,
			[ self::class, 'decodeCharReferencesCallback' ],
			$text,
			-1, // no limit
			$replacements
		);

		// Nothing was replaced, so there is nothing to re-normalize.
		if ( !$replacements ) {
			return $decoded;
		}
		return $wgContLang->normalize( $decoded );
	}

	/**
	 * preg_replace_callback() handler for the decodeCharReferences*() methods.
	 *
	 * Group 1 is handed to decodeEntity(); groups 2 and 3 are converted with
	 * intval() and hexdec() respectively and handed to decodeChar().
	 *
	 * @param array $matches Match array from preg_replace_callback()
	 * @return string
	 */
	static function decodeCharReferencesCallback( $matches ) {
		if ( $matches[1] != '' ) {
			return self::decodeEntity( $matches[1] );
		}
		if ( $matches[2] != '' ) {
			return self::decodeChar( intval( $matches[2] ) );
		}
		if ( $matches[3] != '' ) {
			return self::decodeChar( hexdec( $matches[3] ) );
		}
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	/**
	 * Return UTF-8 string for a codepoint if that is a valid
	 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
	 * @param int $codepoint
	 * @return string
	 * @private
	 */
	static function decodeChar( $codepoint ) {
		return self::validateCodepoint( $codepoint )
			? UtfNormal\Utils::codepointToUtf8( $codepoint )
			: UtfNormal\Constants::UTF8_REPLACEMENT;
	}

	/**
	 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
	 * return the UTF-8 encoding of that character. Otherwise, returns
	 * pseudo-entity source (eg "&foo;")
	 *
	 * An alias listed in self::$htmlEntityAliases is resolved first.
	 *
	 * @param string $name
	 * @return string
	 */
	static function decodeEntity( $name ) {
		$canonical = isset( self::$htmlEntityAliases[$name] )
			? self::$htmlEntityAliases[$name]
			: $name;

		if ( !isset( self::$htmlEntities[$canonical] ) ) {
			return "&$canonical;";
		}
		return UtfNormal\Utils::codepointToUtf8( self::$htmlEntities[$canonical] );
	}

+ /** + * Fetch the whitelist of acceptable attributes for a given element name. 
+ * + * @param string $element + * @return array Allowed attribute names; an empty array for an element that is not in the whitelist + */ + static function attributeWhitelist( $element ) { + $list = self::setupAttributeWhitelist(); + return isset( $list[$element] ) + ? $list[$element] + : []; + } + + /** + * Build the map of allowed HTML element name => list of allowed + * attribute names. The map is kept in a static variable, so it is + * only built on the first call. + * @return array + */ + static function setupAttributeWhitelist() { + static $whitelist; + + if ( $whitelist !== null ) { + return $whitelist; + } + + $common = [ + # HTML + 'id', + 'class', + 'style', + 'lang', + 'dir', + 'title', + + # WAI-ARIA + 'aria-describedby', + 'aria-flowto', + 'aria-label', + 'aria-labelledby', + 'aria-owns', + 'role', + + # RDFa + # These attributes are specified in section 9 of + # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 + 'about', + 'property', + 'resource', + 'datatype', + 'typeof', + + # Microdata. These are specified by + # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model + 'itemid', + 'itemprop', + 'itemref', + 'itemscope', + 'itemtype', + ]; + + $block = array_merge( $common, [ 'align' ] ); + $tablealign = [ 'align', 'valign' ]; + $tablecell = [ + 'abbr', + 'axis', + 'headers', + 'scope', + 'rowspan', + 'colspan', + 'nowrap', # deprecated + 'width', # deprecated + 'height', # deprecated + 'bgcolor', # deprecated + ]; + + # Numbers refer to sections in HTML 4.01 standard describing the element. 
+ # See: https://www.w3.org/TR/html4/ + $whitelist = [ + # 7.5.4 + 'div' => $block, + 'center' => $common, # deprecated + 'span' => $common, + + # 7.5.5 + 'h1' => $block, + 'h2' => $block, + 'h3' => $block, + 'h4' => $block, + 'h5' => $block, + 'h6' => $block, + + # 7.5.6 + # address + + # 8.2.4 + 'bdo' => $common, + + # 9.2.1 + 'em' => $common, + 'strong' => $common, + 'cite' => $common, + 'dfn' => $common, + 'code' => $common, + 'samp' => $common, + 'kbd' => $common, + 'var' => $common, + 'abbr' => $common, + # acronym + + # 9.2.2 + 'blockquote' => array_merge( $common, [ 'cite' ] ), + 'q' => array_merge( $common, [ 'cite' ] ), + + # 9.2.3 + 'sub' => $common, + 'sup' => $common, + + # 9.3.1 + 'p' => $block, + + # 9.3.2 + 'br' => array_merge( $common, [ 'clear' ] ), + + # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element + 'wbr' => $common, + + # 9.3.4 + 'pre' => array_merge( $common, [ 'width' ] ), + + # 9.4 + 'ins' => array_merge( $common, [ 'cite', 'datetime' ] ), + 'del' => array_merge( $common, [ 'cite', 'datetime' ] ), + + # 10.2 + 'ul' => array_merge( $common, [ 'type' ] ), + 'ol' => array_merge( $common, [ 'type', 'start', 'reversed' ] ), + 'li' => array_merge( $common, [ 'type', 'value' ] ), + + # 10.3 + 'dl' => $common, + 'dd' => $common, + 'dt' => $common, + + # 11.2.1 + 'table' => array_merge( $common, + [ 'summary', 'width', 'border', 'frame', + 'rules', 'cellspacing', 'cellpadding', + 'align', 'bgcolor', + ] ), + + # 11.2.2 + 'caption' => $block, + + # 11.2.3 + 'thead' => $common, + 'tfoot' => $common, + 'tbody' => $common, + + # 11.2.4 + 'colgroup' => array_merge( $common, [ 'span' ] ), + 'col' => array_merge( $common, [ 'span' ] ), + + # 11.2.5 + 'tr' => array_merge( $common, [ 'bgcolor' ], $tablealign ), + + # 11.2.6 + 'td' => array_merge( $common, $tablecell, $tablealign ), + 'th' => array_merge( $common, $tablecell, $tablealign ), + + # 12.2 + # NOTE: <a> is not allowed directly, but the attrib + # whitelist is used from the 
Parser object + 'a' => array_merge( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa + + # 13.2 + # Not usually allowed, but may be used for extension-style hooks + # such as <math> when it is rasterized, or if $wgAllowImageTag is + # true + 'img' => array_merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ), + + 'video' => array_merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ), + 'source' => array_merge( $common, [ 'type', 'src' ] ), + 'track' => array_merge( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ), + + # 15.2.1 + 'tt' => $common, + 'b' => $common, + 'i' => $common, + 'big' => $common, + 'small' => $common, + 'strike' => $common, + 's' => $common, + 'u' => $common, + + # 15.2.2 + 'font' => array_merge( $common, [ 'size', 'color', 'face' ] ), + # basefont + + # 15.3 + 'hr' => array_merge( $common, [ 'width' ] ), + + # HTML Ruby annotation text module, simple ruby only. + # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element + 'ruby' => $common, + # rbc + 'rb' => $common, + 'rp' => $common, + 'rt' => $common, # array_merge( $common, array( 'rbspan' ) ), + 'rtc' => $common, + + # MathML root element, where used for extensions + # 'title' may not be 100% valid here; it's XHTML + # https://www.w3.org/TR/REC-MathML/ + 'math' => [ 'class', 'style', 'id', 'title' ], + + // HTML 5 section 4.5 + 'figure' => $common, + 'figcaption' => $common, + + # HTML 5 section 4.6 + 'bdi' => $common, + + # HTML5 elements, defined by: + # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element + 'data' => array_merge( $common, [ 'value' ] ), + 'time' => array_merge( $common, [ 'datetime' ] ), + 'mark' => $common, + + // meta and link are only permitted by removeHTMLtags when Microdata + // is enabled so we don't bother adding a conditional to hide these + // Also meta and link are only valid in WikiText as Microdata elements + // (ie: validateTag rejects tags missing the attributes needed for 
Microdata) + // So we don't bother including $common attributes that have no purpose. + 'meta' => [ 'itemprop', 'content' ], + 'link' => [ 'itemprop', 'href', 'title' ], + ]; + + return $whitelist; + } + + /** + * Take a fragment of (potentially invalid) HTML and return + * a version with any tags removed, encoded as plain text. + * + * Warning: this return value must be further escaped for literal + * inclusion in HTML output as of 1.10! + * + * @param string $html HTML fragment + * @return string + */ + static function stripAllTags( $html ) { + // Use RemexHtml to tokenize $html and extract the text + $handler = new RemexStripTagHandler; + $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [ + 'ignoreErrors' => true, + // don't ignore char refs, we want them to be decoded + 'ignoreNulls' => true, + 'skipPreprocess' => true, + ] ); + $tokenizer->execute(); + $text = $handler->getResult(); + + $text = self::normalizeWhitespace( $text ); + return $text; + } + + /** + * Hack up a private DOCTYPE with HTML's standard entity declarations. + * PHP 4 seemed to know these if you gave it an HTML doctype, but + * PHP 5.1 doesn't. + * + * Use for passing XHTML fragments to PHP's XML parsing functions + * + * @return string + */ + static function hackDocType() { + $out = "<!DOCTYPE html [\n"; + foreach ( self::$htmlEntities as $entity => $codepoint ) { + $out .= "<!ENTITY $entity \"&#$codepoint;\">"; + } + $out .= "]>\n"; + return $out; + } + + /** + * Clean up a URL: decode character references, URL-encode square + * brackets, angle brackets, double quotes, pipes, spaces and control + * characters, strip characters that are ignored in IDNs from the host + * part, and restore the brackets of a URL-encoded IPv6 host. + * + * @param string $url + * @return mixed|string + */ + static function cleanUrl( $url ) { + # Normalize any HTML entities in input. They will be + # re-escaped by makeExternalLink(). 
+ $url = self::decodeCharReferences( $url ); + + # Escape any control characters introduced by the above step + $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', + [ __CLASS__, 'cleanUrlCallback' ], $url ); + + # Validate hostname portion + $matches = []; + if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { + list( /* $whole */, $protocol, $host, $rest ) = $matches; + + // Characters that will be ignored in IDNs. + // https://tools.ietf.org/html/rfc3454#section-3.1 + // Strip them before further processing so blacklists and such work. + $strip = "/ + \\s| # general whitespace + \xc2\xad| # 00ad SOFT HYPHEN + \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN + \xe2\x80\x8b| # 200b ZERO WIDTH SPACE + \xe2\x81\xa0| # 2060 WORD JOINER + \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE + \xcd\x8f| # 034f COMBINING GRAPHEME JOINER + \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE + \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO + \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE + \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER + \xe2\x80\x8d| # 200d ZERO WIDTH JOINER + [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16 + /xuD"; + + $host = preg_replace( $strip, '', $host ); + + // IPv6 host names are bracketed with []. Url-decode these. + if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 && + preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches ) + ) { + $host = '//[' . $matches[1] . ']' . $matches[2]; + } + + // @todo FIXME: Validate hostnames here + + return $protocol . $host . $rest; + } else { + return $url; + } + } + + /** + * preg_replace_callback() handler used by cleanUrl(): returns the + * matched text passed through urlencode(). + * + * @param array $matches + * @return string + */ + static function cleanUrlCallback( $matches ) { + return urlencode( $matches[0] ); + } + + /** + * Does a string look like an e-mail address? 
+ * + * This validates an email address using an HTML5 specification found at: + * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address + * Which as of 2011-01-24 says: + * + * A valid e-mail address is a string that matches the ABNF production + * 1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined + * in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section + * 3.5. + * + * This function is an implementation of the specification as requested in + * T24449. + * + * Client-side forms will use the same standard validation rules via JS or + * HTML 5 validation; additional restrictions can be enforced server-side + * by extensions via the 'isValidEmailAddr' hook. + * + * Note that this validation doesn't 100% match RFC 2822, but is believed + * to be liberal enough for wide use. Some invalid addresses will still + * pass validation here. + * + * @since 1.18 + * + * @param string $addr E-mail address + * @return bool + */ + public static function validateEmail( $addr ) { + $result = null; + if ( !Hooks::run( 'isValidEmailAddr', [ $addr, &$result ] ) ) { + return $result; + } + + // Please note strings below are enclosed in brackets [], this makes the + // hyphen "-" a range indicator. Hence it is double backslashed below. 
+ // See T28948 + $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~"; + $rfc1034_ldh_str = "a-z0-9\\-"; + + $html5_email_regexp = "/ + ^ # start of string + [$rfc5322_atext\\.]+ # user part which is liberal :p + @ # 'apostrophe' + [$rfc1034_ldh_str]+ # First domain part + (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot + $ # End of string + /ix"; // case Insensitive, eXtended + + return (bool)preg_match( $html5_email_regexp, $addr ); + } +} diff --git a/www/wiki/includes/parser/StripState.php b/www/wiki/includes/parser/StripState.php new file mode 100644 index 00000000..855ce1d5 --- /dev/null +++ b/www/wiki/includes/parser/StripState.php @@ -0,0 +1,297 @@ +<?php +/** + * Holder for stripped items when parsing wiki markup. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + */ + +/** + * @todo document, briefly. 
+ * @ingroup Parser + */
class StripState {
	/** @var array[] Strip items: [ 'nowiki' => [ key => value ], 'general' => [ key => value ] ] */
	protected $data;
	/** @var string Regex matching one strip marker; capture group 1 is the item key */
	protected $regex;

	/** @var Parser|null Parser that receives limitation warnings, if any */
	protected $parser;

	/** @var bool[] Keys of the markers currently being expanded, used to detect loops */
	protected $circularRefGuard;
	/** @var int Current nesting depth of unstripType() */
	protected $depth = 0;
	/** @var int Greatest nesting depth reached so far */
	protected $highestDepth = 0;
	/** @var int Running total of strlen() of every value expanded so far */
	protected $expandSize = 0;

	/** @var int Nesting depth at which a warning is substituted for the value */
	protected $depthLimit = 20;
	/** @var int Total expanded size above which a warning is substituted for the value */
	protected $sizeLimit = 5000000;

	/**
	 * @param Parser|null $parser
	 * @param array $options May contain 'depthLimit' and 'sizeLimit' to
	 *  override the default limits
	 */
	public function __construct( Parser $parser = null, $options = [] ) {
		$this->data = [
			'nowiki' => [],
			'general' => []
		];
		$this->regex = '/' . Parser::MARKER_PREFIX . "([^\x7f<>&'\"]+)" . Parser::MARKER_SUFFIX . '/';
		$this->circularRefGuard = [];
		$this->parser = $parser;

		if ( isset( $options['depthLimit'] ) ) {
			$this->depthLimit = $options['depthLimit'];
		}
		if ( isset( $options['sizeLimit'] ) ) {
			$this->sizeLimit = $options['sizeLimit'];
		}
	}

	/**
	 * Add a nowiki strip item
	 * @param string $marker
	 * @param string $value
	 */
	public function addNoWiki( $marker, $value ) {
		$this->addItem( 'nowiki', $marker, $value );
	}

	/**
	 * Add a general strip item
	 * @param string $marker
	 * @param string $value
	 */
	public function addGeneral( $marker, $value ) {
		$this->addItem( 'general', $marker, $value );
	}

	/**
	 * Store a value under the key extracted from the given marker.
	 *
	 * @throws MWException If $marker does not match the marker regex
	 * @param string $type 'nowiki' or 'general'
	 * @param string $marker
	 * @param string $value
	 */
	protected function addItem( $type, $marker, $value ) {
		if ( !preg_match( $this->regex, $marker, $m ) ) {
			throw new MWException( "Invalid marker: $marker" );
		}

		$this->data[$type][$m[1]] = $value;
	}

	/**
	 * Replace the markers of general items in the text by their values.
	 *
	 * @param string $text
	 * @return mixed
	 */
	public function unstripGeneral( $text ) {
		return $this->unstripType( 'general', $text );
	}

	/**
	 * Replace the markers of nowiki items in the text by their values.
	 *
	 * @param string $text
	 * @return mixed
	 */
	public function unstripNoWiki( $text ) {
		return $this->unstripType( 'nowiki', $text );
	}

	/**
	 * Replace the markers of general items, then those of nowiki items.
	 *
	 * @param string $text
	 * @return mixed
	 */
	public function unstripBoth( $text ) {
		$text = $this->unstripType( 'general', $text );
		$text = $this->unstripType( 'nowiki', $text );
		return $text;
	}

	/**
	 * Recursively replace the markers of one item type by their values.
	 *
	 * A marker that refers to itself, nesting at or beyond $depthLimit, or a
	 * total expansion above $sizeLimit produces a warning span instead of
	 * the value. Markers with no stored item of this type are left as is.
	 *
	 * @param string $type 'nowiki' or 'general'
	 * @param string $text
	 * @return mixed
	 */
	protected function unstripType( $type, $text ) {
		// Shortcut
		if ( !count( $this->data[$type] ) ) {
			return $text;
		}

		$callback = function ( $m ) use ( $type ) {
			$marker = $m[1];
			if ( isset( $this->data[$type][$marker] ) ) {
				if ( isset( $this->circularRefGuard[$marker] ) ) {
					return $this->getWarning( 'parser-unstrip-loop-warning' );
				}

				if ( $this->depth > $this->highestDepth ) {
					$this->highestDepth = $this->depth;
				}
				if ( $this->depth >= $this->depthLimit ) {
					return $this->getLimitationWarning( 'unstrip-depth', $this->depthLimit );
				}

				$value = $this->data[$type][$marker];
				// A Closure is called now to obtain the actual value.
				if ( $value instanceof Closure ) {
					$value = $value();
				}

				$this->expandSize += strlen( $value );
				if ( $this->expandSize > $this->sizeLimit ) {
					return $this->getLimitationWarning( 'unstrip-size', $this->sizeLimit );
				}

				// Guard this marker while its own value is being expanded.
				$this->circularRefGuard[$marker] = true;
				$this->depth++;
				$ret = $this->unstripType( $type, $value );
				$this->depth--;
				unset( $this->circularRefGuard[$marker] );

				return $ret;
			} else {
				return $m[0];
			}
		};

		$text = preg_replace_callback( $this->regex, $callback, $text );
		return $text;
	}

	/**
	 * Get warning HTML and register a limitation warning with the parser
	 *
	 * @param string $type
	 * @param int $max
	 * @return string
	 */
	private function getLimitationWarning( $type, $max = '' ) {
		if ( $this->parser ) {
			$this->parser->limitationWarn( $type, $max );
		}
		return $this->getWarning( "$type-warning", $max );
	}

	/**
	 * Get warning HTML
	 *
	 * @param string $message
	 * @param int $max
	 * @return string
	 */
	private function getWarning( $message, $max = '' ) {
		return '<span class="error">' .
			wfMessage( $message )
				->numParams( $max )->inContentLanguage()->text() .
			'</span>';
	}

	/**
	 * Get an array of parameters to pass to ParserOutput::setLimitReportData()
	 *
	 * @internal Should only be called by Parser
	 * @return array
	 */
	public function getLimitReport() {
		return [
			[ 'limitreport-unstrip-depth',
				[
					$this->highestDepth,
					$this->depthLimit
				],
			],
			[ 'limitreport-unstrip-size',
				[
					$this->expandSize,
					$this->sizeLimit
				],
			]
		];
	}

	/**
	 * Get a StripState object which is sufficient to unstrip the given text.
	 * It will contain the minimum subset of strip items necessary.
	 *
	 * @deprecated since 1.31
	 * @param string $text
	 * @return StripState
	 */
	public function getSubState( $text ) {
		wfDeprecated( __METHOD__, '1.31' );

		$subState = new StripState;
		$prefixLength = strlen( Parser::MARKER_PREFIX );
		$suffixLength = strlen( Parser::MARKER_SUFFIX );
		$pos = 0;
		while ( true ) {
			$startPos = strpos( $text, Parser::MARKER_PREFIX, $pos );
			if ( $startPos === false ) {
				break;
			}
			// Only a suffix that follows the prefix can close the marker; a
			// stray suffix earlier in the text must not be paired with it.
			$endPos = strpos( $text, Parser::MARKER_SUFFIX, $startPos + $prefixLength );
			if ( $endPos === false ) {
				break;
			}

			$endPos += $suffixLength;
			$marker = substr( $text, $startPos, $endPos - $startPos );
			// Advance before validating, so that a malformed marker is
			// skipped instead of being re-examined forever.
			$pos = $endPos;
			if ( !preg_match( $this->regex, $marker, $m ) ) {
				continue;
			}

			$key = $m[1];
			if ( isset( $this->data['nowiki'][$key] ) ) {
				$subState->data['nowiki'][$key] = $this->data['nowiki'][$key];
			} elseif ( isset( $this->data['general'][$key] ) ) {
				$subState->data['general'][$key] = $this->data['general'][$key];
			}
		}
		return $subState;
	}

	/**
	 * Merge another StripState object into this one. The strip marker keys
	 * will not be preserved. The strings in the $texts array will have their
	 * strip markers rewritten, the resulting array of strings will be returned.
	 *
	 * @deprecated since 1.31
	 * @param StripState $otherState
	 * @param array $texts
	 * @return array
	 */
	public function merge( $otherState, $texts ) {
		wfDeprecated( __METHOD__, '1.31' );

		// A random prefix keeps the imported keys apart from existing ones.
		$mergePrefix = wfRandomString( 16 );

		foreach ( $otherState->data as $type => $items ) {
			foreach ( $items as $key => $value ) {
				$this->data[$type]["$mergePrefix-$key"] = $value;
			}
		}

		$callback = function ( $m ) use ( $mergePrefix ) {
			$key = $m[1];
			return Parser::MARKER_PREFIX . $mergePrefix . '-' . $key . Parser::MARKER_SUFFIX;
		};
		$texts = preg_replace_callback( $otherState->regex, $callback, $texts );
		return $texts;
	}

	/**
	 * Remove any strip markers found in the given text.
	 *
	 * @param string $text
	 * @return string
	 */
	public function killMarkers( $text ) {
		return preg_replace( $this->regex, '', $text );
	}
}