summaryrefslogtreecommitdiff
path: root/www/wiki/includes/tidy/Balancer.php
diff options
context:
space:
mode:
Diffstat (limited to 'www/wiki/includes/tidy/Balancer.php')
-rw-r--r--www/wiki/includes/tidy/Balancer.php3584
1 files changed, 3584 insertions, 0 deletions
diff --git a/www/wiki/includes/tidy/Balancer.php b/www/wiki/includes/tidy/Balancer.php
new file mode 100644
index 00000000..6671f49b
--- /dev/null
+++ b/www/wiki/includes/tidy/Balancer.php
@@ -0,0 +1,3584 @@
+<?php
+/**
+ * An implementation of the tree building portion of the HTML5 parsing
+ * spec.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Parser
+ * @since 1.27
+ * @author C. Scott Ananian, 2016
+ */
+
+namespace MediaWiki\Tidy;
+
+use ExplodeIterator;
+use IteratorAggregate;
+use ReverseArrayIterator;
+use Sanitizer;
+use Wikimedia\Assert\Assert;
+use Wikimedia\Assert\ParameterAssertionException;
+
+// A note for future librarization[1] -- this file is a good candidate
+// for splitting into an independent library, except that it is currently
+// highly optimized for MediaWiki use. It only implements the portions
+// of the HTML5 tree builder used by tags supported by MediaWiki, and
+// does not contain a true tokenizer pass, instead relying on
+// comment stripping, attribute normalization, and escaping done by
+// the MediaWiki Sanitizer. It also deliberately avoids building
+// a true DOM in memory, instead serializing elements to an output string
+// as soon as possible (usually as soon as the tag is closed) to reduce
+// its memory footprint.
+
+// We've been gradually lifting some of these restrictions to handle
+// non-sanitized output generated by extensions, but we shortcut the tokenizer
+// for speed (primarily by splitting on `<`) and so rely on syntactic
+// well-formedness.
+
+// On the other hand, I've been pretty careful to note with comments in the
+// code the places where this implementation omits features of the spec or
+// depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
+// implement the missing pieces and make this a standalone PHP HTML5 parser.
+// In order to do so, some sort of MediaWiki-specific API will need
+// to be added to (a) allow the Balancer to bypass the tokenizer,
+// and (b) support on-the-fly flattening instead of DOM node creation.
+
+// [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
+
+/**
+ * Shared constants and tag-name lookup tables for the HTML5 tree
+ * building algorithm.
+ *
+ * Every table is an associative array keyed first by namespace URI and
+ * then by lower-cased tag name, with `true` as the value, so membership
+ * is a pair of isset() checks (see BalanceElement::isA()).
+ *
+ * @ingroup Parser
+ * @since 1.27
+ */
+class BalanceSets {
+	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
+	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
+	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
+
+	/** HTML tags listed as unsupported (consumers live outside this class). */
+	public static $unsupportedSet = [
+		self::HTML_NAMESPACE => [
+			'html' => true, 'head' => true, 'body' => true,
+			'frameset' => true, 'frame' => true, 'plaintext' => true,
+			'xmp' => true, 'iframe' => true, 'noembed' => true,
+			'noscript' => true, 'script' => true, 'title' => true
+		]
+	];
+
+	/**
+	 * Elements serialized in self-closing form, without children
+	 * (see BalanceElement::__toString()).
+	 */
+	public static $emptyElementSet = [
+		self::HTML_NAMESPACE => [
+			'area' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
+			'br' => true, 'col' => true, 'command' => true, 'embed' => true,
+			'frame' => true, 'hr' => true, 'img' => true, 'input' => true,
+			'keygen' => true, 'link' => true, 'meta' => true, 'param' => true,
+			'source' => true, 'track' => true, 'wbr' => true
+		]
+	];
+
+	/**
+	 * Elements whose leading linefeed is doubled when flattening
+	 * (see BalanceElement::flatten()).
+	 */
+	public static $extraLinefeedSet = [
+		self::HTML_NAMESPACE => [
+			'pre' => true, 'textarea' => true, 'listing' => true,
+		]
+	];
+
+	/** The six HTML heading elements. */
+	public static $headingSet = [
+		self::HTML_NAMESPACE => [
+			'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true,
+			'h5' => true, 'h6' => true
+		]
+	];
+
+	/** The "special" category of elements, across all three namespaces. */
+	public static $specialSet = [
+		self::HTML_NAMESPACE => [
+			'address' => true, 'applet' => true, 'area' => true, 'article' => true,
+			'aside' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
+			'blockquote' => true, 'body' => true, 'br' => true, 'button' => true,
+			'caption' => true, 'center' => true, 'col' => true, 'colgroup' => true,
+			'dd' => true, 'details' => true, 'dir' => true, 'div' => true,
+			'dl' => true, 'dt' => true, 'embed' => true, 'fieldset' => true,
+			'figcaption' => true, 'figure' => true, 'footer' => true, 'form' => true,
+			'frame' => true, 'frameset' => true, 'h1' => true, 'h2' => true,
+			'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true,
+			'head' => true, 'header' => true, 'hgroup' => true, 'hr' => true,
+			'html' => true, 'iframe' => true, 'img' => true, 'input' => true,
+			'li' => true, 'link' => true, 'listing' => true, 'main' => true,
+			'marquee' => true, 'menu' => true, 'meta' => true, 'nav' => true,
+			'noembed' => true, 'noframes' => true, 'noscript' => true, 'object' => true,
+			'ol' => true, 'p' => true, 'param' => true, 'plaintext' => true,
+			'pre' => true, 'script' => true, 'section' => true, 'select' => true,
+			'source' => true, 'style' => true, 'summary' => true, 'table' => true,
+			'tbody' => true, 'td' => true, 'template' => true, 'textarea' => true,
+			'tfoot' => true, 'th' => true, 'thead' => true, 'title' => true,
+			'tr' => true, 'track' => true, 'ul' => true, 'wbr' => true,
+			'xmp' => true
+		],
+		self::SVG_NAMESPACE => [
+			'foreignobject' => true, 'desc' => true, 'title' => true
+		],
+		self::MATHML_NAMESPACE => [
+			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
+			'mtext' => true, 'annotation-xml' => true
+		]
+	];
+
+	/** The address, div and p elements. */
+	public static $addressDivPSet = [
+		self::HTML_NAMESPACE => [
+			'address' => true, 'div' => true, 'p' => true
+		]
+	];
+
+	/**
+	 * Table, table-section and row elements; BalanceStack consults this
+	 * set when deciding whether to foster-parent an insertion.
+	 */
+	public static $tableSectionRowSet = [
+		self::HTML_NAMESPACE => [
+			'table' => true, 'thead' => true, 'tbody' => true,
+			'tfoot' => true, 'tr' => true
+		]
+	];
+
+	/**
+	 * Elements closed by BalanceStack::generateImpliedEndTags() in its
+	 * default (non-thorough) mode.
+	 */
+	public static $impliedEndTagsSet = [
+		self::HTML_NAMESPACE => [
+			'dd' => true, 'dt' => true, 'li' => true, 'menuitem' => true,
+			'optgroup' => true, 'option' => true, 'p' => true, 'rb' => true,
+			'rp' => true, 'rt' => true, 'rtc' => true
+		]
+	];
+
+	/**
+	 * Elements closed by BalanceStack::generateImpliedEndTags() in
+	 * thorough mode.
+	 */
+	public static $thoroughImpliedEndTagsSet = [
+		self::HTML_NAMESPACE => [
+			'caption' => true, 'colgroup' => true, 'dd' => true,
+			'dt' => true, 'li' => true, 'optgroup' => true,
+			'option' => true, 'p' => true, 'rb' => true,
+			'rp' => true, 'rt' => true, 'rtc' => true,
+			'tbody' => true, 'td' => true, 'tfoot' => true,
+			'th' => true, 'thead' => true, 'tr' => true
+		]
+	];
+
+	/** The two table cell elements. */
+	public static $tableCellSet = [
+		self::HTML_NAMESPACE => [
+			'td' => true, 'th' => true
+		]
+	];
+
+	/** Table context elements. */
+	public static $tableContextSet = [
+		self::HTML_NAMESPACE => [
+			'table' => true, 'template' => true, 'html' => true
+		]
+	];
+
+	/** Table body context elements. */
+	public static $tableBodyContextSet = [
+		self::HTML_NAMESPACE => [
+			'tbody' => true, 'tfoot' => true, 'thead' => true,
+			'template' => true, 'html' => true
+		]
+	];
+
+	/** Table row context elements. */
+	public static $tableRowContextSet = [
+		self::HTML_NAMESPACE => [
+			'tr' => true, 'template' => true, 'html' => true
+		]
+	];
+
+	// See https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
+	public static $formAssociatedSet = [
+		self::HTML_NAMESPACE => [
+			'button' => true, 'fieldset' => true, 'input' => true,
+			'keygen' => true, 'object' => true, 'output' => true,
+			'select' => true, 'textarea' => true, 'img' => true
+		]
+	];
+
+	/**
+	 * Scope boundary elements for BalanceStack::inScope(); also the base
+	 * from which the list-item and button scope sets are derived.
+	 */
+	public static $inScopeSet = [
+		self::HTML_NAMESPACE => [
+			'applet' => true, 'caption' => true, 'html' => true,
+			'marquee' => true, 'object' => true, 'table' => true,
+			'td' => true, 'template' => true, 'th' => true
+		],
+		self::SVG_NAMESPACE => [
+			'foreignobject' => true, 'desc' => true, 'title' => true
+		],
+		self::MATHML_NAMESPACE => [
+			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
+			'mtext' => true, 'annotation-xml' => true
+		]
+	];
+
+	/** Lazily built cache for inListItemScopeSet(). */
+	private static $inListItemScopeSet = null;
+
+	/**
+	 * Return the list item scope set: $inScopeSet plus the HTML 'ol'
+	 * and 'ul' elements. Built on first use and cached.
+	 * @return array
+	 */
+	public static function inListItemScopeSet() {
+		if ( self::$inListItemScopeSet !== null ) {
+			return self::$inListItemScopeSet;
+		}
+		$scope = self::$inScopeSet;
+		$scope[self::HTML_NAMESPACE] += [ 'ol' => true, 'ul' => true ];
+		self::$inListItemScopeSet = $scope;
+		return self::$inListItemScopeSet;
+	}
+
+	/** Lazily built cache for inButtonScopeSet(). */
+	private static $inButtonScopeSet = null;
+
+	/**
+	 * Return the button scope set: $inScopeSet plus the HTML 'button'
+	 * element. Built on first use and cached.
+	 * @return array
+	 */
+	public static function inButtonScopeSet() {
+		if ( self::$inButtonScopeSet !== null ) {
+			return self::$inButtonScopeSet;
+		}
+		$scope = self::$inScopeSet;
+		$scope[self::HTML_NAMESPACE] += [ 'button' => true ];
+		self::$inButtonScopeSet = $scope;
+		return self::$inButtonScopeSet;
+	}
+
+	/** Scope boundary elements for BalanceStack::inTableScope(). */
+	public static $inTableScopeSet = [
+		self::HTML_NAMESPACE => [
+			'html' => true, 'table' => true, 'template' => true
+		]
+	];
+
+	/**
+	 * The only elements which do NOT end select scope; used in inverted
+	 * form by BalanceStack::inSelectScope().
+	 */
+	public static $inInvertedSelectScopeSet = [
+		self::HTML_NAMESPACE => [
+			'option' => true, 'optgroup' => true
+		]
+	];
+
+	/** See BalanceElement::isMathmlTextIntegrationPoint(). */
+	public static $mathmlTextIntegrationPointSet = [
+		self::MATHML_NAMESPACE => [
+			'mi' => true, 'mo' => true, 'mn' => true,
+			'ms' => true, 'mtext' => true
+		]
+	];
+
+	/** See BalanceElement::isHtmlIntegrationPoint(). */
+	public static $htmlIntegrationPointSet = [
+		self::SVG_NAMESPACE => [
+			'foreignobject' => true, 'desc' => true, 'title' => true
+		]
+	];
+
+	// For tidy compatibility.
+	public static $tidyPWrapSet = [
+		self::HTML_NAMESPACE => [
+			'body' => true, 'blockquote' => true,
+			// We parse with <body> as the fragment context, but the top-level
+			// element on the stack is actually <html>. We could use the
+			// "adjusted current node" everywhere to work around this, but it's
+			// easier just to add <html> to the p-wrap set.
+			'html' => true,
+		],
+	];
+
+	/**
+	 * Elements which may be inserted without closing an open
+	 * 'mw:p-wrap' element (see BalanceStack::insertElement()).
+	 */
+	public static $tidyInlineSet = [
+		self::HTML_NAMESPACE => [
+			'a' => true, 'abbr' => true, 'acronym' => true,
+			'applet' => true, 'b' => true, 'basefont' => true,
+			'bdo' => true, 'big' => true, 'br' => true,
+			'button' => true, 'cite' => true, 'code' => true,
+			'dfn' => true, 'em' => true, 'font' => true,
+			'i' => true, 'iframe' => true, 'img' => true,
+			'input' => true, 'kbd' => true, 'label' => true,
+			'legend' => true, 'map' => true, 'object' => true,
+			'param' => true, 'q' => true, 'rb' => true,
+			'rbc' => true, 'rp' => true, 'rt' => true,
+			'rtc' => true, 'ruby' => true, 's' => true,
+			'samp' => true, 'select' => true, 'small' => true,
+			'span' => true, 'strike' => true, 'strong' => true,
+			'sub' => true, 'sup' => true, 'textarea' => true,
+			'tt' => true, 'u' => true, 'var' => true,
+			// Those defined in tidy.conf
+			'video' => true, 'audio' => true, 'bdi' => true,
+			'data' => true, 'time' => true, 'mark' => true,
+		],
+	];
+}
+
+/**
+ * A BalanceElement is a simplified version of a DOM Node. The main
+ * difference is that we only keep BalanceElements around for nodes
+ * currently on the BalanceStack of open elements. As soon as an
+ * element is closed, with some minor exceptions relating to the
+ * tree builder "adoption agency algorithm", the element and all its
+ * children are serialized to a string using the flatten() method.
+ * This keeps our memory usage low.
+ *
+ * @ingroup Parser
+ * @since 1.27
+ */
+class BalanceElement {
+	/**
+	 * The namespace of the element.
+	 * @var string $namespaceURI
+	 */
+	public $namespaceURI;
+	/**
+	 * The lower-cased name of the element.
+	 * @var string $localName
+	 */
+	public $localName;
+	/**
+	 * Attributes for the element, in array form
+	 * @var array $attribs
+	 */
+	public $attribs;
+
+	/**
+	 * Set to the empty string by the constructor; nothing in this class
+	 * reads it. Declared explicitly so that the constructor's assignment
+	 * does not create a dynamic property (deprecated as of PHP 8.2).
+	 * @var string $contents
+	 */
+	public $contents;
+
+	/**
+	 * Parent of this element, or the string "flat" if this element has
+	 * already been flattened into its parent.
+	 * @var BalanceElement|string|null $parent
+	 */
+	public $parent;
+
+	/**
+	 * An array of children of this element. Typically only the last
+	 * child will be an actual BalanceElement object; the rest will
+	 * be strings, representing either text nodes or flattened
+	 * BalanceElement objects.
+	 * @var BalanceElement[]|string[] $children
+	 */
+	public $children;
+
+	/**
+	 * A unique string identifier for Noah's Ark purposes, lazy initialized
+	 * by getNoahKey().
+	 * @var string|null $noahKey
+	 */
+	private $noahKey;
+
+	/**
+	 * The next active formatting element in the list, or null if this is the
+	 * end of the AFE list or if the element is not in the AFE list.
+	 */
+	public $nextAFE;
+
+	/**
+	 * The previous active formatting element in the list, or null if this is
+	 * the start of the list or if the element is not in the AFE list.
+	 */
+	public $prevAFE;
+
+	/**
+	 * The next element in the Noah's Ark species bucket.
+	 */
+	public $nextNoah;
+
+	/**
+	 * Make a new BalanceElement corresponding to the HTML DOM Element
+	 * with the given localname, namespace, and attributes.
+	 *
+	 * @param string $namespaceURI The namespace of the element.
+	 * @param string $localName The lowercased name of the tag.
+	 * @param array $attribs Attributes of the element
+	 */
+	public function __construct( $namespaceURI, $localName, array $attribs ) {
+		$this->localName = $localName;
+		$this->namespaceURI = $namespaceURI;
+		$this->attribs = $attribs;
+		$this->contents = '';
+		$this->parent = null;
+		$this->children = [];
+	}
+
+	/**
+	 * Remove the given child from this element and clear its parent link.
+	 * @param BalanceElement $elt
+	 */
+	private function removeChild( BalanceElement $elt ) {
+		// Keep the message static: interpolating $this here would
+		// serialize the whole subtree on every call, even on success.
+		Assert::precondition(
+			$this->parent !== 'flat', "Can't removeChild after flattening."
+		);
+		Assert::parameter(
+			$elt->parent === $this, '$elt', 'must have $this as a parent'
+		);
+		$idx = array_search( $elt, $this->children, true );
+		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
+		$elt->parent = null;
+		array_splice( $this->children, $idx, 1 );
+	}
+
+	/**
+	 * Find $a in the list of children and insert $b before it.
+	 * If $b is an element which already has a parent, it is detached
+	 * from that parent first.
+	 * @param BalanceElement $a
+	 * @param BalanceElement|string $b
+	 */
+	public function insertBefore( BalanceElement $a, $b ) {
+		Assert::precondition(
+			$this->parent !== 'flat', "Can't insertBefore after flattening."
+		);
+		$idx = array_search( $a, $this->children, true );
+		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
+		if ( is_string( $b ) ) {
+			array_splice( $this->children, $idx, 0, [ $b ] );
+			return;
+		}
+		Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
+		if ( $b->parent !== null ) {
+			$b->parent->removeChild( $b );
+			if ( $b !== $a ) {
+				// If $b was an earlier sibling of $a, detaching it shifted
+				// $a one slot to the left; look the position up again so
+				// that $b really lands immediately before $a.
+				$idx = array_search( $a, $this->children, true );
+			}
+		}
+		array_splice( $this->children, $idx, 0, [ $b ] );
+		$b->parent = $this;
+	}
+
+	/**
+	 * Append $elt to the end of the list of children.
+	 * An element which already has a parent is detached from it first.
+	 * @param BalanceElement|string $elt
+	 */
+	public function appendChild( $elt ) {
+		Assert::precondition(
+			$this->parent !== 'flat', "Can't appendChild after flattening."
+		);
+		if ( is_string( $elt ) ) {
+			$this->children[] = $elt;
+			return;
+		}
+		// Remove $elt from parent, if it had one.
+		if ( $elt->parent !== null ) {
+			$elt->parent->removeChild( $elt );
+		}
+		$this->children[] = $elt;
+		$elt->parent = $this;
+	}
+
+	/**
+	 * Transfer all of the children of $elt to $this.
+	 * @param BalanceElement $elt
+	 */
+	public function adoptChildren( BalanceElement $elt ) {
+		Assert::precondition(
+			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
+		);
+		foreach ( $elt->children as $child ) {
+			if ( !is_string( $child ) ) {
+				// This is an optimization which avoids an O(n^2) set of
+				// array_splice operations: with the parent link cleared,
+				// appendChild() will not try to detach the child.
+				$child->parent = null;
+			}
+			$this->appendChild( $child );
+		}
+		$elt->children = [];
+	}
+
+	/**
+	 * Flatten this node and all of its children into a string, as specified
+	 * by the HTML serialization specification, and replace this node
+	 * in its parent by that string.
+	 *
+	 * When $config['tidyCompat'] is set, element children are flattened
+	 * first, 'mw:p-wrap' elements are renamed to 'p' (and dropped entirely
+	 * when they contain only whitespace), and other whitespace-only
+	 * elements are kept, with attribute-less 'tr' and 'li' elements
+	 * receiving the 'mw-empty-elt' class.
+	 *
+	 * @param array $config Balancer configuration; see Balancer::__construct().
+	 * @return string
+	 *
+	 * @see __toString()
+	 */
+	public function flatten( array $config ) {
+		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
+		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
+		$idx = array_search( $this, $this->parent->children, true );
+		Assert::parameter(
+			$idx !== false, '$this', 'must be a child of its parent'
+		);
+		$tidyCompat = $config['tidyCompat'];
+		if ( $tidyCompat ) {
+			// $blank stays true while every child is whitespace-only.
+			$blank = true;
+			foreach ( $this->children as $elt ) {
+				if ( !is_string( $elt ) ) {
+					$elt = $elt->flatten( $config );
+				}
+				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
+					$blank = false;
+				}
+			}
+			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
+				$this->localName = 'p';
+			} elseif ( $blank ) {
+				// Add 'mw-empty-elt' class so elements can be hidden via CSS
+				// for compatibility with legacy tidy.
+				if ( !count( $this->attribs ) &&
+					( $this->localName === 'tr' || $this->localName === 'li' )
+				) {
+					$this->attribs = [ 'class' => "mw-empty-elt" ];
+				}
+				// Only blank p-wrap elements are dropped from the output.
+				$blank = false;
+			} elseif (
+				$this->isA( BalanceSets::$extraLinefeedSet ) &&
+				count( $this->children ) > 0 &&
+				substr( $this->children[0], 0, 1 ) === "\n"
+			) {
+				// Double the linefeed after pre/listing/textarea
+				// according to the (old) HTML5 fragment serialization
+				// algorithm (see https://github.com/whatwg/html/issues/944)
+				// to ensure this will round-trip.
+				array_unshift( $this->children, "\n" );
+			}
+			$flat = $blank ? '' : "{$this}";
+		} else {
+			$flat = "{$this}";
+		}
+		$this->parent->children[$idx] = $flat;
+		$this->parent = 'flat'; // for assertion checking
+		return $flat;
+	}
+
+	/**
+	 * Serialize this node and all of its children to a string, as specified
+	 * by the HTML serialization specification.
+	 *
+	 * @return string The serialization of the BalanceElement
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
+	 */
+	public function __toString() {
+		$encAttribs = '';
+		foreach ( $this->attribs as $name => $value ) {
+			$encValue = Sanitizer::encodeAttribute( $value );
+			$encAttribs .= " $name=\"$encValue\"";
+		}
+		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
+			$out = "<{$this->localName}{$encAttribs}>";
+			// flatten children; element children serialize recursively
+			// through their own __toString().
+			foreach ( $this->children as $elt ) {
+				$out .= "{$elt}";
+			}
+			$out .= "</{$this->localName}>";
+		} else {
+			$out = "<{$this->localName}{$encAttribs} />";
+			Assert::invariant(
+				count( $this->children ) === 0,
+				"Empty elements shouldn't have children."
+			);
+		}
+		return $out;
+	}
+
+	// Utility functions on BalanceElements.
+
+	/**
+	 * Determine if $this represents a specific HTML tag, is a member of
+	 * a tag set, or is equal to another BalanceElement.
+	 *
+	 * @param BalanceElement|array|string $set The target BalanceElement,
+	 *   set (from the BalanceSets class), or string (HTML tag name).
+	 * @return bool
+	 */
+	public function isA( $set ) {
+		if ( $set instanceof BalanceElement ) {
+			return $this === $set;
+		} elseif ( is_array( $set ) ) {
+			return isset( $set[$this->namespaceURI] ) &&
+				isset( $set[$this->namespaceURI][$this->localName] );
+		} else {
+			// assume this is an HTML element name.
+			return $this->isHtml() && $this->localName === $set;
+		}
+	}
+
+	/**
+	 * Determine if this element is an HTML element with the specified name
+	 * @param string $tagName
+	 * @return bool
+	 */
+	public function isHtmlNamed( $tagName ) {
+		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
+			&& $this->localName === $tagName;
+	}
+
+	/**
+	 * Determine if $this represents an element in the HTML namespace.
+	 *
+	 * @return bool
+	 */
+	public function isHtml() {
+		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
+	}
+
+	/**
+	 * Determine if $this represents a MathML text integration point,
+	 * as defined in the HTML5 specification.
+	 *
+	 * @return bool
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
+	 */
+	public function isMathmlTextIntegrationPoint() {
+		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
+	}
+
+	/**
+	 * Determine if $this represents an HTML integration point,
+	 * as defined in the HTML5 specification: either a member of
+	 * BalanceSets::$htmlIntegrationPointSet, or a MathML annotation-xml
+	 * element whose 'encoding' attribute is (case-insensitively)
+	 * 'text/html' or 'application/xhtml+xml'.
+	 *
+	 * @return bool
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
+	 */
+	public function isHtmlIntegrationPoint() {
+		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
+			return true;
+		}
+		if (
+			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
+			$this->localName === 'annotation-xml' &&
+			isset( $this->attribs['encoding'] ) &&
+			( strcasecmp( $this->attribs['encoding'], 'text/html' ) === 0 ||
+			strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) === 0 )
+		) {
+			return true;
+		}
+		return false;
+	}
+
+	/**
+	 * Get a string key for the Noah's Ark algorithm. The key is the
+	 * serialization of the namespace, local name and key-sorted
+	 * attributes, computed once and cached.
+	 * @return string
+	 */
+	public function getNoahKey() {
+		if ( $this->noahKey === null ) {
+			// Sort a copy so that attribute order does not affect the key
+			// and $this->attribs keeps its original order.
+			$attribs = $this->attribs;
+			ksort( $attribs );
+			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
+		}
+		return $this->noahKey;
+	}
+}
+
+/**
+ * The "stack of open elements" as defined in the HTML5 tree builder
+ * spec. This contains methods to ensure that content (start tags, text)
+ * are inserted at the correct place in the output string, and to
+ * flatten BalanceElements as they are closed to avoid holding onto
+ * a complete DOM tree for the document in memory.
+ *
+ * The stack defines a PHP iterator to traverse it in "reverse order",
+ * that is, the most-recently-added element is visited first in a
+ * foreach loop.
+ *
+ * @ingroup Parser
+ * @since 1.27
+ * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
+ */
+class BalanceStack implements IteratorAggregate {
+ /**
+ * Backing storage for the stack. Position 0 holds the root element;
+ * the last position holds the current node.
+ * @var BalanceElement[] $elements
+ */
+ private $elements = [];
+ /**
+ * Foster parent mode determines how nodes are inserted into the
+ * stack.
+ * @var bool $fosterParentMode
+ * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
+ */
+ public $fosterParentMode = false;
+ /**
+ * Configuration options governing flattening.
+ * @var array $config
+ * @see Balancer::__construct()
+ */
+ private $config;
+ /**
+ * Reference to the current element: the most recently pushed element
+ * still on the stack. Becomes null if the last element is popped.
+ * @var BalanceElement|null $currentNode
+ */
+ public $currentNode;
+
+	/**
+	 * Build a stack holding exactly one BalanceElement: the root
+	 * &lt;html&gt; node, which also becomes the current node.
+	 * @param array $config Balancer configuration; see Balancer::__construct().
+	 */
+	public function __construct( array $config ) {
+		$this->config = $config;
+		// The root <html> element always sits at position 0.
+		$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
+		$this->elements[] = $root;
+		$this->currentNode = $root;
+	}
+
+	/**
+	 * Produce the tree builder's output: the concatenated serialization
+	 * of every child of the root &lt;html&gt; node. Children which are
+	 * still elements are flattened in the process.
+	 * @return string
+	 */
+	public function getOutput() {
+		// The enclosing '<html>....</html>' is deliberately left out.
+		$pieces = [];
+		foreach ( $this->elements[0]->children as $child ) {
+			if ( !is_string( $child ) ) {
+				$child = $child->flatten( $this->config );
+			}
+			$pieces[] = $child;
+		}
+		return implode( '', $pieces );
+	}
+
+	/**
+	 * Insert a comment at the appropriate place for inserting a node.
+	 * @param string $value Content of the comment.
+	 * @return null Passes through the result of insertText(), which
+	 *   does not produce a value.
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
+	 */
+	public function insertComment( $value ) {
+		// A comment is handled as text, but flagged so that it is exempt
+		// from tidy p-wrapping.
+		$comment = '<!--' . $value . '-->';
+		return $this->insertText( $comment, true );
+	}
+
+	/**
+	 * Insert text at the appropriate place for inserting a node.
+	 *
+	 * In foster parent mode, text aimed at a table, table section or row
+	 * is foster parented. In tidy-compatibility mode, non-comment text
+	 * aimed at a member of BalanceSets::$tidyPWrapSet is first wrapped in
+	 * a new 'mw:p-wrap' element. Otherwise the text is appended to the
+	 * current node.
+	 *
+	 * @param string $value
+	 * @param bool $isComment True when $value is a serialized comment.
+	 * @return null No value is produced on any path.
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
+	 */
+	public function insertText( $value, $isComment = false ) {
+		$target = $this->currentNode;
+		if (
+			$this->fosterParentMode &&
+			$target->isA( BalanceSets::$tableSectionRowSet )
+		) {
+			$this->fosterParent( $value );
+			return;
+		}
+		$needsWrapper = $this->config['tidyCompat'] && !$isComment &&
+			$target->isA( BalanceSets::$tidyPWrapSet );
+		if ( !$needsWrapper ) {
+			$target->appendChild( $value );
+			return;
+		}
+		// Open the wrapper, then retry against the new current node.
+		$this->insertHTMLElement( 'mw:p-wrap', [] );
+		return $this->insertText( $value );
+	}
+
+	/**
+	 * Create a BalanceElement in the given namespace, insert it at the
+	 * appropriate place, and push it on to the open elements stack.
+	 * @param string $namespaceURI The element namespace
+	 * @param string $tag The tag name
+	 * @param array $attribs Attributes, in the array form required by the
+	 *   BalanceElement constructor.
+	 * @return BalanceElement
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
+	 */
+	public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
+		$elt = new BalanceElement( $namespaceURI, $tag, $attribs );
+		return $this->insertElement( $elt );
+	}
+
+	/**
+	 * Create an element in the HTML namespace, insert it at the
+	 * appropriate place, and push it on to the open elements stack.
+	 * @param string $tag The tag name
+	 * @param array $attribs Attributes, in the array form required by the
+	 *   BalanceElement constructor.
+	 * @return BalanceElement
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
+	 */
+	public function insertHTMLElement( $tag, $attribs ) {
+		$htmlNs = BalanceSets::HTML_NAMESPACE;
+		return $this->insertForeignElement( $htmlNs, $tag, $attribs );
+	}
+
+	/**
+	 * Insert an element at the appropriate place and push it on to the
+	 * open elements stack, making it the current node.
+	 *
+	 * If the current node is a 'mw:p-wrap' element and $elt is not in
+	 * BalanceSets::$tidyInlineSet, the wrapper is popped first. In foster
+	 * parent mode, an element aimed at a table, table section or row is
+	 * foster parented instead of being appended to the current node.
+	 *
+	 * @param BalanceElement $elt
+	 * @return BalanceElement The element which was pushed on to the stack.
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
+	 */
+	public function insertElement( BalanceElement $elt ) {
+		if (
+			$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
+			!$elt->isA( BalanceSets::$tidyInlineSet )
+		) {
+			// Tidy compatibility.
+			$this->pop();
+		}
+		if (
+			$this->fosterParentMode &&
+			$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
+		) {
+			$elt = $this->fosterParent( $elt );
+		} else {
+			$this->currentNode->appendChild( $elt );
+		}
+		// The messages are static on purpose: interpolating $elt would
+		// serialize the element twice on every insertion, even though the
+		// text is only needed when an invariant fails.
+		Assert::invariant( $elt->parent !== null, 'Inserted element must be in tree' );
+		Assert::invariant(
+			$elt->parent !== 'flat',
+			'Inserted element must not have been previously flattened'
+		);
+		$this->elements[] = $elt;
+		$this->currentNode = $elt;
+		return $elt;
+	}
+
+	/**
+	 * Determine if the stack has $tag in scope, using
+	 * BalanceSets::$inScopeSet as the scope boundary.
+	 * @param BalanceElement|array|string $tag
+	 * @return bool
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
+	 */
+	public function inScope( $tag ) {
+		$boundary = BalanceSets::$inScopeSet;
+		return $this->inSpecificScope( $tag, $boundary );
+	}
+
+	/**
+	 * Determine if the stack has $tag in button scope, using
+	 * BalanceSets::inButtonScopeSet() as the scope boundary.
+	 * @param BalanceElement|array|string $tag
+	 * @return bool
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
+	 */
+	public function inButtonScope( $tag ) {
+		$boundary = BalanceSets::inButtonScopeSet();
+		return $this->inSpecificScope( $tag, $boundary );
+	}
+
+	/**
+	 * Determine if the stack has $tag in list item scope, using
+	 * BalanceSets::inListItemScopeSet() as the scope boundary.
+	 * @param BalanceElement|array|string $tag
+	 * @return bool
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
+	 */
+	public function inListItemScope( $tag ) {
+		$boundary = BalanceSets::inListItemScopeSet();
+		return $this->inSpecificScope( $tag, $boundary );
+	}
+
+	/**
+	 * Determine if the stack has $tag in table scope, using
+	 * BalanceSets::$inTableScopeSet as the scope boundary.
+	 * @param BalanceElement|array|string $tag
+	 * @return bool
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
+	 */
+	public function inTableScope( $tag ) {
+		$boundary = BalanceSets::$inTableScopeSet;
+		return $this->inSpecificScope( $tag, $boundary );
+	}
+
+	/**
+	 * Determine if the stack has $tag in select scope.
+	 *
+	 * Select scope is defined by inversion: every element ends the scope
+	 * *except* the members of BalanceSets::$inInvertedSelectScopeSet, so
+	 * inSpecificScope() cannot be reused and the walk is done by hand.
+	 *
+	 * @param BalanceElement|array|string $tag
+	 * @return bool
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
+	 */
+	public function inSelectScope( $tag ) {
+		foreach ( $this as $node ) {
+			if ( $node->isA( $tag ) ) {
+				return true;
+			}
+			if ( $node->isA( BalanceSets::$inInvertedSelectScopeSet ) ) {
+				// Transparent to select scope; keep walking toward the root.
+				continue;
+			}
+			return false;
+		}
+		return false;
+	}
+
+	/**
+	 * Determine if the stack has $tag in a specific scope, $set.
+	 *
+	 * Walks from the current node toward the root and stops at the first
+	 * element which either matches $tag (in scope) or belongs to the
+	 * boundary $set (not in scope). A match on $tag takes precedence.
+	 *
+	 * @param BalanceElement|array|string $tag
+	 * @param BalanceElement|array|string $set
+	 * @return bool
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
+	 */
+	public function inSpecificScope( $tag, $set ) {
+		foreach ( $this as $node ) {
+			$isTarget = $node->isA( $tag );
+			if ( $isTarget || $node->isA( $set ) ) {
+				return $isTarget;
+			}
+		}
+		return false;
+	}
+
+	/**
+	 * Generate implied end tags: pop the current node for as long as it
+	 * belongs to the implied end tag set and is not the HTML element
+	 * named by $butnot.
+	 * @param string|null $butnot HTML tag name which must not be popped,
+	 *   or null for no exclusion.
+	 * @param bool $thorough True if we should generate end tags thoroughly,
+	 *   i.e. use BalanceSets::$thoroughImpliedEndTagsSet.
+	 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
+	 */
+	public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
+		if ( $thorough ) {
+			$endTagSet = BalanceSets::$thoroughImpliedEndTagsSet;
+		} else {
+			$endTagSet = BalanceSets::$impliedEndTagsSet;
+		}
+		for ( $node = $this->currentNode; $node; $node = $this->currentNode ) {
+			$excluded = $butnot !== null && $node->isHtmlNamed( $butnot );
+			if ( $excluded || !$node->isA( $endTagSet ) ) {
+				return;
+			}
+			$this->pop();
+		}
+	}
+
+	/**
+	 * Return the adjusted current node: $fragmentContext when one is
+	 * given and only a single element is on the stack, otherwise the
+	 * current node.
+	 * @param BalanceElement|string|null $fragmentContext Fragment context
+	 *   supplied by the caller (its exact type is not visible from this
+	 *   class; verify against callers).
+	 * @return BalanceElement|string|null
+	 */
+	public function adjustedCurrentNode( $fragmentContext ) {
+		if ( $fragmentContext && count( $this->elements ) === 1 ) {
+			return $fragmentContext;
+		}
+		return $this->currentNode;
+	}
+
+	/**
+	 * Return an iterator over this stack which starts at the current
+	 * node and finishes at the root node.
+	 * @return \Iterator
+	 */
+	public function getIterator() {
+		$reversed = new ReverseArrayIterator( $this->elements );
+		return $reversed;
+	}
+
+	/**
+	 * Return the BalanceElement stored at position $idx of the stack;
+	 * position 0 is the root element.
+	 * @param int $idx
+	 * @return BalanceElement
+	 */
+	public function node( $idx ) {
+		$found = $this->elements[ $idx ];
+		return $found;
+	}
+
+	/**
+	 * Replace the element at position $idx in the BalanceStack with $elt.
+	 * Neither the old nor the new element may have been flattened. When
+	 * the replaced slot is the top of the stack, $elt also becomes the
+	 * current node.
+	 * @param int $idx
+	 * @param BalanceElement $elt
+	 */
+	public function replaceAt( $idx, BalanceElement $elt ) {
+		$previous = $this->elements[$idx];
+		Assert::precondition(
+			$previous->parent !== 'flat',
+			'Replaced element should not have already been flattened.'
+		);
+		Assert::precondition(
+			$elt->parent !== 'flat',
+			'New element should not have already been flattened.'
+		);
+		$this->elements[$idx] = $elt;
+		$top = count( $this->elements ) - 1;
+		if ( $idx === $top ) {
+			$this->currentNode = $elt;
+		}
+	}
+
+	/**
+	 * Return the position of the given BalanceElement, set, or
+	 * HTML tag name string in the BalanceStack. The search runs from the
+	 * current node toward the root, so the match nearest the top wins.
+	 * @param BalanceElement|array|string $tag
+	 * @return int The position, or -1 when nothing on the stack matches.
+	 */
+	public function indexOf( $tag ) {
+		$pos = count( $this->elements );
+		while ( $pos-- > 0 ) {
+			if ( $this->elements[$pos]->isA( $tag ) ) {
+				return $pos;
+			}
+		}
+		return -1;
+	}
+
+	/**
+	 * Return how many elements are currently on the BalanceStack.
+	 * @return int
+	 */
+	public function length() {
+		$size = count( $this->elements );
+		return $size;
+	}
+
+	/**
+	 * Remove the current node from the BalanceStack and make the element
+	 * beneath it (or null, if none remains) the current node. The removed
+	 * element is flattened unless it is a 'mw:p-wrap' element.
+	 */
+	public function pop() {
+		$popped = array_pop( $this->elements );
+		$remaining = count( $this->elements );
+		$this->currentNode = $remaining ? $this->elements[ $remaining - 1 ] : null;
+		if ( $popped->isHtmlNamed( 'mw:p-wrap' ) ) {
+			// p-wrap elements are left unflattened here.
+			return;
+		}
+		$popped->flatten( $this->config );
+	}
+
+	/**
+	 * Pop every node from the top of the BalanceStack down to and
+	 * including position $idx, flattening them in the process.
+	 * @param int $idx
+	 */
+	public function popTo( $idx ) {
+		// The number of pops is fixed up front from the starting depth.
+		$pops = count( $this->elements ) - $idx;
+		while ( $pops-- > 0 ) {
+			$this->pop();
+		}
+	}
+
+	/**
+	 * Pop elements off the stack up to and including the first
+	 * element with the specified HTML tagname (or matching the given
+	 * set). If nothing matches, the stack is emptied.
+	 * @param BalanceElement|array|string $tag
+	 */
+	public function popTag( $tag ) {
+		while ( $this->currentNode ) {
+			// Test before popping: pop() changes the current node.
+			$matched = $this->currentNode->isA( $tag );
+			$this->pop();
+			if ( $matched ) {
+				return;
+			}
+		}
+	}
+
+ /**
+ * Pop elements off the stack *not including* the first element
+ * in the specified set.
+ * @param BalanceElement|array|string $set
+ */
+ public function clearToContext( $set ) {
+     // Stop while one element remains: the <html> elt at the bottom of
+     // the stack is never popped off.
+     $remaining = count( $this->elements );
+     while ( $remaining > 1 && !$this->currentNode->isA( $set ) ) {
+         $this->pop();
+         $remaining--;
+     }
+ }
+
+ /**
+ * Remove the given $elt from the BalanceStack, optionally
+ * flattening it in the process.
+ * @param BalanceElement $elt The element to remove.
+ * @param bool $flatten Whether to flatten the removed element.
+ */
+ public function removeElement( BalanceElement $elt, $flatten = true ) {
+     Assert::parameter(
+         $elt->parent !== 'flat',
+         '$elt',
+         '$elt should not already have been flattened.'
+     );
+     // A detached element has no parent to inspect; only look at the
+     // grandparent link when a parent actually exists.
+     Assert::parameter(
+         $elt->parent === null || $elt->parent->parent !== 'flat',
+         '$elt',
+         'The parent of $elt should not already have been flattened.'
+     );
+     $idx = array_search( $elt, $this->elements, true );
+     Assert::parameter( $idx !== false, '$elt', 'must be in stack' );
+     array_splice( $this->elements, $idx, 1 );
+     $remaining = count( $this->elements );
+     if ( $idx === $remaining ) {
+         // The removed element was the top of the stack, so the current
+         // node changes. As in pop(), an emptied stack has no current
+         // node (previously this read the undefined offset -1).
+         $this->currentNode = $remaining > 0 ?
+             $this->elements[$remaining - 1] :
+             null;
+     }
+     if ( $flatten ) {
+         // serialize $elt into its parent
+         // otherwise, it will eventually serialize when the parent
+         // is serialized, we just hold onto the memory for its
+         // tree of objects a little longer.
+         $elt->flatten( $this->config );
+     }
+     Assert::postcondition(
+         array_search( $elt, $this->elements, true ) === false,
+         '$elt should no longer be in open elements stack'
+     );
+ }
+
+ /**
+ * Find $a in the BalanceStack and insert $b after it.
+ * @param BalanceElement $a
+ * @param BalanceElement $b
+ */
+ public function insertAfter( BalanceElement $a, BalanceElement $b ) {
+     $idx = $this->indexOf( $a );
+     // indexOf() reports "not found" as -1, never false. Checking
+     // against false let a missing $a through, and $b was then spliced
+     // in at the bottom of the stack (index 0).
+     Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
+     if ( $idx === count( $this->elements ) - 1 ) {
+         // $a is the current node, so $b becomes the new top of stack.
+         array_push( $this->elements, $b );
+         $this->currentNode = $b;
+     } else {
+         // Insert in the middle; the current node is unaffected.
+         array_splice( $this->elements, $idx + 1, 0, [ $b ] );
+     }
+ }
+
+ // Fostering and adoption.
+
+ /**
+ * Foster parent the given $elt in the stack of open elements.
+ * @param BalanceElement|string $elt
+ * @return BalanceElement|string
+ *
+ * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
+ */
+ private function fosterParent( $elt ) {
+     $tableIdx = $this->indexOf( 'table' );
+     $templateIdx = $this->indexOf( 'template' );
+     // When set, $sibling is the node that $elt must be inserted before.
+     $sibling = null;
+
+     // A <template> wins when it sits above the last <table> on the
+     // stack, or when there is no <table> at all.
+     $templateWins = $templateIdx >= 0 &&
+         ( $tableIdx < 0 || $templateIdx > $tableIdx );
+     if ( $templateWins ) {
+         $target = $this->elements[$templateIdx];
+     } elseif ( $tableIdx >= 0 ) {
+         $sibling = $this->elements[$tableIdx];
+         $target = $sibling->parent;
+         // Assume all tables have parents, since we're not running scripts!
+         Assert::invariant(
+             $target !== null, "All tables should have parents"
+         );
+     } else {
+         // Fall back to the `html` element at the bottom of the stack.
+         $target = $this->elements[0];
+     }
+
+     if ( $this->config['tidyCompat'] ) {
+         if ( is_string( $elt ) ) {
+             // Fostered text may need a p-wrapper around it.
+             if ( $target->isA( BalanceSets::$tidyPWrapSet ) ) {
+                 $this->insertHTMLElement( 'mw:p-wrap', [] );
+                 $this->insertText( $elt );
+                 return $elt;
+             }
+         } elseif ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
+             // Fostered p-wrapper: if the node it would follow is itself
+             // a p-wrapper, hand that one back instead of inserting.
+             $pos = $sibling ?
+                 array_search( $sibling, $target->children, true ) :
+                 count( $target->children );
+             $preceding = $pos > 0 ? $target->children[$pos - 1] : '';
+             if (
+                 $preceding instanceof BalanceElement &&
+                 $preceding->isHtmlNamed( 'mw:p-wrap' )
+             ) {
+                 return $preceding;
+             }
+         }
+     }
+
+     if ( $sibling ) {
+         $target->insertBefore( $sibling, $elt );
+     } else {
+         $target->appendChild( $elt );
+     }
+     return $elt;
+ }
+
+ /**
+ * Run the "adoption agency algorithm" (AAA) for the given subject
+ * tag name.
+ * @param string $tag The subject tag name.
+ * @param BalanceActiveFormattingElements $afe The current
+ * active formatting elements list.
+ * @return true if the adoption agency algorithm "did something", false
+ * if more processing is required by the caller.
+ * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
+ */
+ public function adoptionAgency( $tag, $afe ) {
+ // If the current node is an HTML element whose tag name is subject,
+ // and the current node is not in the list of active formatting
+ // elements, then pop the current node off the stack of open
+ // elements and abort these steps.
+ if (
+ $this->currentNode->isHtmlNamed( $tag ) &&
+ !$afe->isInList( $this->currentNode )
+ ) {
+ $this->pop();
+ return true; // no more handling required
+ }
+
+ // Outer loop: If outer loop counter is greater than or
+ // equal to eight, then abort these steps.
+ for ( $outer = 0; $outer < 8; $outer++ ) {
+ // Let the formatting element be the last element in the list
+ // of active formatting elements that: is between the end of
+ // the list and the last scope marker in the list, if any, or
+ // the start of the list otherwise, and has the same tag name
+ // as the token.
+ $fmtElt = $afe->findElementByTag( $tag );
+
+ // If there is no such node, then abort these steps and instead
+ // act as described in the "any other end tag" entry below.
+ if ( !$fmtElt ) {
+ return false; // false means handle by the default case
+ }
+
+ // Otherwise, if there is such a node, but that node is not in
+ // the stack of open elements, then this is a parse error;
+ // remove the element from the list, and abort these steps.
+ $index = $this->indexOf( $fmtElt );
+ if ( $index < 0 ) {
+ $afe->remove( $fmtElt );
+ return true; // true means no more handling required
+ }
+
+ // Otherwise, if there is such a node, and that node is also in
+ // the stack of open elements, but the element is not in scope,
+ // then this is a parse error; ignore the token, and abort
+ // these steps.
+ if ( !$this->inScope( $fmtElt ) ) {
+ return true;
+ }
+
+ // Let the furthest block be the topmost node in the stack of
+ // open elements that is lower in the stack than the formatting
+ // element, and is an element in the special category. There
+ // might not be one.
+ // (The scan starts just past the formatting element's index and
+ // stops at the first match.)
+ $furthestBlock = null;
+ $furthestBlockIndex = -1;
+ $stackLength = $this->length();
+ for ( $i = $index + 1; $i < $stackLength; $i++ ) {
+ if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) {
+ $furthestBlock = $this->node( $i );
+ $furthestBlockIndex = $i;
+ break;
+ }
+ }
+
+ // If there is no furthest block, then the UA must skip the
+ // subsequent steps and instead just pop all the nodes from the
+ // bottom of the stack of open elements, from the current node
+ // up to and including the formatting element, and remove the
+ // formatting element from the list of active formatting
+ // elements.
+ if ( !$furthestBlock ) {
+ $this->popTag( $fmtElt );
+ $afe->remove( $fmtElt );
+ return true;
+ }
+
+ // Let the common ancestor be the element immediately above
+ // the formatting element in the stack of open elements.
+ $ancestor = $this->node( $index - 1 );
+
+ // Let a bookmark note the position of the formatting
+ // element in the list of active formatting elements
+ // relative to the elements on either side of it in the
+ // list.
+ // The bookmark is a placeholder BalanceElement linked into the AFE
+ // list; it is swapped for the new formatting element further down
+ // via $afe->replace().
+ $BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] );
+ $afe->insertAfter( $fmtElt, $BOOKMARK );
+
+ // Let node and last node be the furthest block.
+ $node = $furthestBlock;
+ $lastNode = $furthestBlock;
+ $nodeIndex = $furthestBlockIndex;
+ $isAFE = false;
+
+ // Inner loop
+ for ( $inner = 1; true; $inner++ ) {
+ // Let node be the element immediately above node in
+ // the stack of open elements, or if node is no longer
+ // in the stack of open elements (e.g. because it got
+ // removed by this algorithm), the element that was
+ // immediately above node in the stack of open elements
+ // before node was removed.
+ // Decrementing the index is enough even after a removal:
+ // removeElement() splices out the entry at $nodeIndex, which
+ // leaves the entries at lower indexes where they were.
+ $node = $this->node( --$nodeIndex );
+
+ // If node is the formatting element, then go
+ // to the next step in the overall algorithm.
+ if ( $node === $fmtElt ) break;
+
+ // If the inner loop counter is greater than three and node
+ // is in the list of active formatting elements, then remove
+ // node from the list of active formatting elements.
+ $isAFE = $afe->isInList( $node );
+ if ( $inner > 3 && $isAFE ) {
+ $afe->remove( $node );
+ $isAFE = false;
+ }
+
+ // If node is not in the list of active formatting
+ // elements, then remove node from the stack of open
+ // elements and then go back to the step labeled inner
+ // loop.
+ if ( !$isAFE ) {
+ // Don't flatten here, since we're about to relocate
+ // parts of this $node.
+ $this->removeElement( $node, false );
+ continue;
+ }
+
+ // Create an element for the token for which the
+ // element node was created with common ancestor as
+ // the intended parent, replace the entry for node
+ // in the list of active formatting elements with an
+ // entry for the new element, replace the entry for
+ // node in the stack of open elements with an entry for
+ // the new element, and let node be the new element.
+ $newElt = new BalanceElement(
+ $node->namespaceURI, $node->localName, $node->attribs );
+ $afe->replace( $node, $newElt );
+ $this->replaceAt( $nodeIndex, $newElt );
+ $node = $newElt;
+
+ // If last node is the furthest block, then move the
+ // aforementioned bookmark to be immediately after the
+ // new node in the list of active formatting elements.
+ if ( $lastNode === $furthestBlock ) {
+ $afe->remove( $BOOKMARK );
+ $afe->insertAfter( $newElt, $BOOKMARK );
+ }
+
+ // Insert last node into node, first removing it from
+ // its previous parent node if any.
+ $node->appendChild( $lastNode );
+
+ // Let last node be node.
+ $lastNode = $node;
+ }
+
+ // If the common ancestor node is a table, tbody, tfoot,
+ // thead, or tr element, then, foster parent whatever last
+ // node ended up being in the previous step, first removing
+ // it from its previous parent node if any.
+ if (
+ $this->fosterParentMode &&
+ $ancestor->isA( BalanceSets::$tableSectionRowSet )
+ ) {
+ $this->fosterParent( $lastNode );
+ } else {
+ // Otherwise, append whatever last node ended up being in
+ // the previous step to the common ancestor node, first
+ // removing it from its previous parent node if any.
+ $ancestor->appendChild( $lastNode );
+ }
+
+ // Create an element for the token for which the
+ // formatting element was created, with furthest block
+ // as the intended parent.
+ $newElt2 = new BalanceElement(
+ $fmtElt->namespaceURI, $fmtElt->localName, $fmtElt->attribs );
+
+ // Take all of the child nodes of the furthest block and
+ // append them to the element created in the last step.
+ $newElt2->adoptChildren( $furthestBlock );
+
+ // Append that new element to the furthest block.
+ $furthestBlock->appendChild( $newElt2 );
+
+ // Remove the formatting element from the list of active
+ // formatting elements, and insert the new element into the
+ // list of active formatting elements at the position of
+ // the aforementioned bookmark.
+ $afe->remove( $fmtElt );
+ $afe->replace( $BOOKMARK, $newElt2 );
+
+ // Remove the formatting element from the stack of open
+ // elements, and insert the new element into the stack of
+ // open elements immediately below the position of the
+ // furthest block in that stack.
+ // Unlike the removals in the inner loop, this call relies on
+ // removeElement()'s default and flattens $fmtElt.
+ $this->removeElement( $fmtElt );
+ $this->insertAfter( $furthestBlock, $newElt2 );
+ }
+
+ // Reached only after eight full iterations of the outer loop.
+ return true;
+ }
+
+ /**
+ * Return the contents of the open elements stack as a string for
+ * debugging.
+ * @return string
+ */
+ public function __toString() {
+     // One local name per open element, bottom of the stack first.
+     $names = array_map(
+         function ( $elt ) {
+             return $elt->localName;
+         },
+         $this->elements
+     );
+     return implode( ' ', $names );
+ }
+}
+
+/**
+ * A pseudo-element used as a marker in the list of active formatting elements
+ *
+ * @ingroup Parser
+ * @since 1.27
+ */
+ class BalanceMarker {
+     /** @var BalanceElement|BalanceMarker|null Entry after this marker in the AFE list */
+     public $nextAFE = null;
+     /** @var BalanceElement|BalanceMarker|null Entry before this marker in the AFE list */
+     public $prevAFE = null;
+ }
+
+/**
+ * The list of active formatting elements, which is used to handle
+ * mis-nested formatting element tags in the HTML5 tree builder
+ * specification.
+ *
+ * @ingroup Parser
+ * @since 1.27
+ * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
+ */
+class BalanceActiveFormattingElements {
+ /** The last (most recent) element in the list */
+ private $tail;
+
+ /** The first (least recent) element in the list */
+ private $head;
+
+ /**
+ * An array of arrays representing the population of elements in each bucket
+ * according to the Noah's Ark clause. The outer array is stack-like, with each
+ * integer-indexed element representing a segment of the list, bounded by
+ * markers. The first element represents the segment of the list before the
+ * first marker.
+ *
+ * The inner arrays are indexed by "Noah key", which is a string which uniquely
+ * identifies each bucket according to the rules in the spec. The value in
+ * the inner array is the first (least recently inserted) element in the bucket,
+ * and subsequent members of the bucket can be found by iterating through the
+ * singly-linked list via $node->nextNoah.
+ *
+ * This is optimised for the most common case of inserting into a bucket
+ * with zero members, and deleting a bucket containing one member. In the
+ * worst case, iteration through the list is still O(1) in the document
+ * size, since each bucket can have at most 3 members.
+ */
+ private $noahTableStack = [ [] ];
+
+ /**
+  * Unlink every entry of the list from its neighbours and from its Noah
+  * bucket, then drop the list's own references.
+  * NOTE(review): presumably this helps PHP reclaim the mutually
+  * referencing nodes promptly -- confirm.
+  */
+ public function __destruct() {
+     $node = $this->head;
+     while ( $node ) {
+         // Remember the successor before the link to it is cleared.
+         $following = $node->nextAFE;
+         $node->nextNoah = null;
+         $node->nextAFE = null;
+         $node->prevAFE = null;
+         $node = $following;
+     }
+     $this->noahTableStack = null;
+     $this->tail = null;
+     $this->head = null;
+ }
+
+ /**
+  * Append a new BalanceMarker to the end of the list and open a fresh,
+  * empty Noah table for the segment that follows it.
+  */
+ public function insertMarker() {
+     $marker = new BalanceMarker;
+     $previousTail = $this->tail;
+     if ( !$previousTail ) {
+         // Empty list: the marker is also the head.
+         $this->head = $marker;
+     } else {
+         $previousTail->nextAFE = $marker;
+         $marker->prevAFE = $previousTail;
+     }
+     $this->tail = $marker;
+     $this->noahTableStack[] = [];
+ }
+
+ /**
+ * Follow the steps required when the spec requires us to "push onto the
+ * list of active formatting elements".
+ * @param BalanceElement $elt
+ */
+ public function push( BalanceElement $elt ) {
+     // An element may appear in the list at most once.
+     $alreadyListed = $elt->prevAFE !== null || $this->head === $elt;
+     if ( $alreadyListed ) {
+         throw new ParameterAssertionException( '$elt',
+             'Cannot insert a node into the AFE list twice' );
+     }
+
+     // "Noah's Ark clause": a bucket (same Noah key, current segment)
+     // that already holds three entries loses its oldest one before the
+     // new entry is chained on.
+     $key = $elt->getNoahKey();
+     $top = count( $this->noahTableStack ) - 1;
+     if ( isset( $this->noahTableStack[$top][$key] ) ) {
+         $first = $this->noahTableStack[$top][$key];
+         $last = $first;
+         $population = 1;
+         while ( $last->nextNoah ) {
+             $last = $last->nextNoah;
+             $population++;
+         }
+         if ( $population >= 3 ) {
+             $this->remove( $first );
+         }
+         $last->nextNoah = $elt;
+     } else {
+         // Empty bucket: $elt becomes its first member.
+         $this->noahTableStack[$top][$key] = $elt;
+     }
+
+     // Append to the main doubly-linked AFE list.
+     $previousTail = $this->tail;
+     if ( !$previousTail ) {
+         $this->head = $elt;
+     } else {
+         $previousTail->nextAFE = $elt;
+         $elt->prevAFE = $previousTail;
+     }
+     $this->tail = $elt;
+ }
+
+ /**
+ * Follow the steps required when the spec asks us to "clear the list of
+ * active formatting elements up to the last marker".
+ */
+ public function clearToMarker() {
+     // Walk backwards from the tail, unlinking entries until a marker
+     // (or the start of the list) is reached.
+     $cursor = $this->tail;
+     while ( $cursor && !( $cursor instanceof BalanceMarker ) ) {
+         $earlier = $cursor->prevAFE;
+         $cursor->prevAFE = null;
+         $cursor->nextNoah = null;
+         if ( $earlier ) {
+             $earlier->nextAFE = null;
+         }
+         $cursor = $earlier;
+     }
+
+     if ( !$cursor ) {
+         // No marker: wipe the top-level Noah table (which is the only one)
+         $this->noahTableStack[0] = [];
+     } else {
+         // Stopped on a marker: unlink it as well and discard the Noah
+         // table of the segment it opened.
+         $earlier = $cursor->prevAFE;
+         if ( $earlier ) {
+             $earlier->nextAFE = null;
+         }
+         $cursor = $earlier;
+         array_pop( $this->noahTableStack );
+     }
+
+     // Whatever is left becomes the new tail; an empty list has no head.
+     $this->tail = $cursor;
+     if ( !$cursor ) {
+         $this->head = null;
+     }
+ }
+
+ /**
+ * Find and return the last element with the specified tag between the
+ * end of the list and the last marker on the list.
+ * Used when parsing &lt;a&gt; "in body mode".
+ * @param string $tag
+ * @return null|BalanceElement
+ */
+ public function findElementByTag( $tag ) {
+     // Walk back from the tail; the search ends at the last marker.
+     for (
+         $cursor = $this->tail;
+         $cursor && !( $cursor instanceof BalanceMarker );
+         $cursor = $cursor->prevAFE
+     ) {
+         if ( $cursor->localName === $tag ) {
+             return $cursor;
+         }
+     }
+     return null;
+ }
+
+ /**
+ * Determine whether an element is in the list of formatting elements.
+ * @param BalanceElement $elt
+ * @return bool
+ */
+ public function isInList( BalanceElement $elt ) {
+     // The head has no predecessor, so it needs its own check; every
+     // other listed entry has a non-null prevAFE.
+     if ( $this->head === $elt ) {
+         return true;
+     }
+     return (bool)$elt->prevAFE;
+ }
+
+ /**
+ * Find the element $elt in the list and remove it.
+ * Used when parsing &lt;a&gt; in body mode.
+ *
+ * @param BalanceElement $elt
+ */
+ public function remove( BalanceElement $elt ) {
+     $before = $elt->prevAFE;
+     $after = $elt->nextAFE;
+     if ( $this->head !== $elt && !$before ) {
+         throw new ParameterAssertionException( '$elt',
+             "Attempted to remove an element which is not in the AFE list" );
+     }
+     // Move the list's end pointers off $elt.
+     if ( $this->tail === $elt ) {
+         $this->tail = $before;
+     }
+     if ( $this->head === $elt ) {
+         $this->head = $after;
+     }
+     // Join the neighbours to each other.
+     if ( $after ) {
+         $after->prevAFE = $before;
+     }
+     if ( $before ) {
+         $before->nextAFE = $after;
+     }
+     // Clear pointers so that isInList() etc. will work
+     $elt->nextAFE = null;
+     $elt->prevAFE = null;
+     // Finally take it out of its Noah bucket.
+     $this->removeFromNoahList( $elt );
+ }
+
+ /**
+  * Append $elt to the bucket for its Noah key in the newest Noah table,
+  * creating the bucket if it does not exist yet.
+  * @param BalanceElement $elt
+  */
+ private function addToNoahList( BalanceElement $elt ) {
+     $key = $elt->getNoahKey();
+     $top = count( $this->noahTableStack ) - 1;
+     if ( !isset( $this->noahTableStack[$top][$key] ) ) {
+         // Empty bucket: $elt becomes its first member.
+         $this->noahTableStack[$top][$key] = $elt;
+         return;
+     }
+     // Otherwise chain $elt onto the end of the bucket.
+     $last = $this->noahTableStack[$top][$key];
+     while ( $last->nextNoah ) {
+         $last = $last->nextNoah;
+     }
+     $last->nextNoah = $elt;
+ }
+
+ /**
+  * Unlink $elt from the Noah bucket that contains it.
+  *
+  * The newest Noah table is searched first, then older ones. Looking
+  * only at the newest table (as before) read an undefined index and then
+  * dereferenced null whenever $elt belonged to an earlier segment of the
+  * list. If $elt is in no bucket at all, nothing is changed.
+  * @param BalanceElement $elt
+  */
+ private function removeFromNoahList( BalanceElement $elt ) {
+     $key = $elt->getNoahKey();
+     for ( $i = count( $this->noahTableStack ) - 1; $i >= 0; $i-- ) {
+         if ( !isset( $this->noahTableStack[$i][$key] ) ) {
+             // This segment has no bucket for the key.
+             continue;
+         }
+         $noahElt = $this->noahTableStack[$i][$key];
+         if ( $noahElt === $elt ) {
+             // $elt is the first member: promote its successor, or drop
+             // the bucket when it was the only member.
+             if ( $elt->nextNoah ) {
+                 $this->noahTableStack[$i][$key] = $elt->nextNoah;
+                 $elt->nextNoah = null;
+             } else {
+                 unset( $this->noahTableStack[$i][$key] );
+             }
+             return;
+         }
+         // Otherwise look for $elt further along the singly-linked bucket.
+         while ( $noahElt->nextNoah ) {
+             if ( $noahElt->nextNoah === $elt ) {
+                 // Found it, unlink
+                 $noahElt->nextNoah = $elt->nextNoah;
+                 $elt->nextNoah = null;
+                 return;
+             }
+             $noahElt = $noahElt->nextNoah;
+         }
+     }
+ }
+
+ /**
+ * Find element $a in the list and replace it with element $b
+ *
+ * @param BalanceElement $a
+ * @param BalanceElement $b
+ */
+ public function replace( BalanceElement $a, BalanceElement $b ) {
+     $before = $a->prevAFE;
+     $after = $a->nextAFE;
+     if ( $this->head !== $a && !$before ) {
+         throw new ParameterAssertionException( '$a',
+             "Attempted to replace an element which is not in the AFE list" );
+     }
+     // Repoint the list's end pointers at $b where they referred to $a.
+     if ( $this->tail === $a ) {
+         $this->tail = $b;
+     }
+     if ( $this->head === $a ) {
+         $this->head = $b;
+     }
+     // Make both neighbours refer to $b.
+     if ( $before ) {
+         $before->nextAFE = $b;
+     }
+     if ( $after ) {
+         $after->prevAFE = $b;
+     }
+     // $b takes over $a's links; $a is left detached.
+     $b->prevAFE = $before;
+     $b->nextAFE = $after;
+     $a->prevAFE = null;
+     $a->nextAFE = null;
+     // Swap the Noah bucket membership as well.
+     $this->removeFromNoahList( $a );
+     $this->addToNoahList( $b );
+ }
+
+ /**
+ * Find $a in the list and insert $b after it.
+ *
+ * @param BalanceElement $a
+ * @param BalanceElement $b
+ */
+ public function insertAfter( BalanceElement $a, BalanceElement $b ) {
+     $successor = $a->nextAFE;
+     if ( $this->head !== $a && !$a->prevAFE ) {
+         throw new ParameterAssertionException( '$a',
+             "Attempted to insert after an element which is not in the AFE list" );
+     }
+     // Inserting after the tail makes $b the new tail.
+     if ( $this->tail === $a ) {
+         $this->tail = $b;
+     }
+     // Splice $b in between $a and its former successor.
+     if ( $successor ) {
+         $successor->prevAFE = $b;
+     }
+     $b->nextAFE = $successor;
+     $b->prevAFE = $a;
+     $a->nextAFE = $b;
+     $this->addToNoahList( $b );
+ }
+
+ /**
+ * Reconstruct the active formatting elements.
+ * @param BalanceStack $stack The open elements stack
+ * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
+ */
+ public function reconstruct( $stack ) {
+     $entry = $this->tail;
+     // Nothing to reconstruct when the list is empty, ends in a marker,
+     // or ends in an element that is still open.
+     if (
+         !$entry ||
+         $entry instanceof BalanceMarker ||
+         $stack->indexOf( $entry ) >= 0
+     ) {
+         return;
+     }
+
+     // Rewind: step backwards until a marker or an open element is
+     // reached, or the start of the list is hit.
+     $stoppedAtBoundary = false;
+     while ( !$stoppedAtBoundary && $entry->prevAFE ) {
+         $entry = $entry->prevAFE;
+         $stoppedAtBoundary = $entry instanceof BalanceMarker ||
+             $stack->indexOf( $entry ) >= 0;
+     }
+
+     // Advance: start just after the boundary entry (or at the first
+     // entry when no boundary was found) and re-open each formatting
+     // element in turn, swapping the new element into the list.
+     if ( $stoppedAtBoundary ) {
+         $entry = $entry->nextAFE;
+     }
+     do {
+         $fresh = $stack->insertHTMLElement(
+             $entry->localName,
+             $entry->attribs );
+         $this->replace( $entry, $fresh );
+         $entry = $fresh->nextAFE;
+     } while ( $entry );
+ }
+
+ /**
+ * Get a string representation of the AFE list, for debugging
+ */
+ public function __toString() {
+     $s = '';
+     // $last tracks the final entry visited, to cross-check the tail.
+     $last = null;
+     $node = $this->head;
+     while ( $node ) {
+         if ( $node instanceof BalanceMarker ) {
+             $s .= "MARKER\n";
+         } else {
+             // Element line: local name plus a short per-object tag.
+             $s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
+             if ( $node->nextNoah ) {
+                 $s .= " (noah sibling: {$node->nextNoah->localName}#" .
+                     substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
+                     ')';
+             }
+             // Flag a forward link whose back link does not match.
+             if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
+                 $s .= " (reverse link is wrong!)";
+             }
+             $s .= "\n";
+         }
+         $last = $node;
+         $node = $node->nextAFE;
+     }
+     if ( $last !== $this->tail ) {
+         $s .= "(tail pointer is wrong!)\n";
+     }
+     return $s;
+ }
+}
+
+/**
+ * An implementation of the tree building portion of the HTML5 parsing
+ * spec.
+ *
+ * This is used to balance and tidy output so that the result can
+ * always be cleanly serialized/deserialized by an HTML5 parser. It
+ * does *not* guarantee "conforming" output -- the HTML5 spec contains
+ * a number of constraints which are not enforced by the HTML5 parsing
+ * process. But the result will be free of gross errors: misnested or
+ * unclosed tags, for example, and will be unchanged by spec-compliant
+ * parsing followed by serialization.
+ *
+ * The tree building stage is structured as a state machine.
+ * When comparing the implementation to
+ * https://www.w3.org/TR/html5/syntax.html#tree-construction
+ * note that each state is implemented as a function with a
+ * name ending in `Mode` (because the HTML spec refers to them
+ * as insertion modes). The current insertion mode is held by
+ * the $parseMode property.
+ *
+ * The following simplifications have been made:
+ * - We handle body content only (ie, we start `in body`.)
+ * - The document is never in "quirks mode".
+ * - All occurrences of < and > have been entity escaped, so we
+ * can parse tags by simply splitting on those two characters.
+ * (This also simplifies the handling of < inside <textarea>.)
+ * The character < must not appear inside comments.
+ * Similarly, all attributes have been "cleaned" and are double-quoted
+ * and escaped.
+ * - All null characters are assumed to have been removed.
+ * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
+ * <frame>, <plaintext>, <xmp>, <iframe>,
+ * <noembed>, <noscript>, <script>, <title>. As a result,
+ * further simplifications can be made:
+ * - `frameset-ok` is not tracked.
+ * - `head element pointer` is not tracked (but presumed non-null)
+ * - Tokenizer has only a single mode. (<textarea> wants RCDATA and
+ * <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
+ *
+ * We generally mark places where we omit cases from the spec due to
+ * disallowed elements with a comment: `// OMITTED: <element-name>`.
+ *
+ * The HTML spec keeps a flag during the parsing process to track
+ * whether or not a "parse error" has been encountered. We don't
+ * bother to track that flag, we just implement the error-handling
+ * process as specified.
+ *
+ * @ingroup Parser
+ * @since 1.27
+ * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
+ */
+class Balancer {
+ // Name of the method implementing the current insertion mode;
+ // insertToken() dispatches non-foreign tokens to it.
+ private $parseMode;
+ // Iterator over the input text split on '<' (an ExplodeIterator,
+ // created in balance()).
+ /** @var \Iterator */
+ private $bitsIterator;
+ // Value of the 'allowedHtmlElements' configuration option.
+ private $allowedHtmlElements;
+ /** @var BalanceActiveFormattingElements */
+ private $afe;
+ /** @var BalanceStack */
+ private $stack;
+ // Value of the 'strict' configuration option.
+ private $strict;
+ // Value of the 'allowComments' configuration option.
+ private $allowComments;
+ // The full configuration array, with defaults filled in.
+ private $config;
+
+ private $textIntegrationMode;
+ private $pendingTableText;
+ private $originalInsertionMode;
+ // Context element for fragment parsing; balance() sets it to a
+ // <body> element.
+ private $fragmentContext;
+ private $formElementPointer;
+ // When true, the next token passed to insertToken() clears the flag
+ // and, if it is text, loses a single leading "\n".
+ private $ignoreLinefeed;
+ private $inRCDATA;
+ private $inRAWTEXT;
+
+ /** @var callable|null */
+ private $processingCallback;
+ /** @var array */
+ private $processingArgs;
+
+ /**
+ * Valid HTML5 comments.
+ * Regex borrowed from Tim Starling's "remex-html" project.
+ */
+ const VALID_COMMENT_REGEX = "~ !--
+ ( # 1. Comment match detector
+ > | -> | # Invalid short close
+ ( # 2. Comment contents
+ (?:
+ (?! --> )
+ (?! --!> )
+ (?! --! \z )
+ (?! -- \z )
+ (?! - \z )
+ .
+ )*+
+ )
+ ( # 3. Comment close
+ --> | # Normal close
+ --!> | # Comment end bang
+ ( # 4. Indicate matches requiring EOF
+ --! | # EOF in comment end bang state
+ -- | # EOF in comment end state
+ - | # EOF in comment end dash state
+ (?#nothing) # EOF in comment state
+ )
+ )
+ )
+ ([^<]*) \z # 5. Non-tag text after the comment
+ ~xs";
+
+ /**
+ * Create a new Balancer.
+ * @param array $config Balancer configuration. Includes:
+ * 'strict' : boolean, defaults to false.
+ * When true, enforces syntactic constraints on input:
+ * all non-tag '<' must be escaped, all attributes must be
+ * separated by a single space and double-quoted. This is
+ * consistent with the output of the Sanitizer.
+ * 'allowedHtmlElements' : array, defaults to null.
+ * When present, the keys of this associative array give
+ * the acceptable HTML tag names. When not present, no
+ * tag sanitization is done.
+ * 'tidyCompat' : boolean, defaults to false.
+ * When true, the serialization algorithm is tweaked to
+ * provide historical compatibility with the old "tidy"
+ * program: <p>-wrapping is done to the children of
+ * <body> and <blockquote> elements, and empty elements
+ * are removed. The <pre>/<listing>/<textarea> serialization
+ * is also tweaked to allow lossless round trips.
+ * (See: https://github.com/whatwg/html/issues/944)
+ * 'allowComments': boolean, defaults to true.
+ * When true, allows HTML comments in the input.
+ * The Sanitizer generally strips all comments, so if you
+ * are running on sanitized output you can set this to
+ * false to get a bit more performance.
+ */
+ public function __construct( array $config = [] ) {
+     // Caller-supplied options take precedence over these defaults.
+     $defaults = [
+         'strict' => false,
+         'allowedHtmlElements' => null,
+         'tidyCompat' => false,
+         'allowComments' => true,
+     ];
+     $config += $defaults;
+     $this->config = $config;
+     $this->strict = $config['strict'];
+     $this->allowComments = $config['allowComments'];
+     $this->allowedHtmlElements = $config['allowedHtmlElements'];
+
+     if ( $this->allowedHtmlElements === null ) {
+         // No tag whitelist given, so there is nothing to validate.
+         return;
+     }
+
+     // Sanity check! The whitelist must not name any element that the
+     // balancer does not support.
+     $unsupported = BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE];
+     $bad = array_uintersect_assoc(
+         $this->allowedHtmlElements,
+         $unsupported,
+         function ( $a, $b ) {
+             // Treat all values as equal so that only the keys decide
+             // the intersection.
+             return 0;
+         }
+     );
+     if ( count( $bad ) === 0 ) {
+         return;
+     }
+     $badstr = implode( ',', array_keys( $bad ) );
+     throw new ParameterAssertionException(
+         '$config',
+         'Balance attempted with sanitization including ' .
+         "unsupported elements: {$badstr}"
+     );
+ }
+
+ /**
+ * Return a balanced HTML string for the HTML fragment given by $text,
+ * subject to the caveats listed in the class description. The result
+ * will typically be idempotent -- that is, rebalancing the output
+ * would result in no change.
+ *
+ * @param string $text The markup to be balanced
+ * @param callable $processingCallback Callback to do any variable or
+ * parameter replacements in HTML attributes values
+ * @param array|bool $processingArgs Arguments for the processing callback
+ * @return string The balanced markup
+ */
+ public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
+     // Start every run from fresh parser state.
+     $this->parseMode = 'inBodyMode';
+     $this->bitsIterator = new ExplodeIterator( '<', $text );
+     $this->afe = new BalanceActiveFormattingElements();
+     $this->stack = new BalanceStack( $this->config );
+     $this->processingCallback = $processingCallback;
+     $this->processingArgs = $processingArgs;
+
+     $this->inRAWTEXT = false;
+     $this->inRCDATA = false;
+     $this->ignoreLinefeed = false;
+     $this->textIntegrationMode = false;
+
+     // The stack is constructed with an <html> element already on it.
+     // Set this up as a fragment parsed with <body> as the context.
+     $this->fragmentContext =
+         new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
+     $this->resetInsertionMode();
+     // The form element pointer is the nearest <form> found by walking
+     // up from the context element, if there is one.
+     $this->formElementPointer = null;
+     $ancestor = $this->fragmentContext;
+     while ( $ancestor != null ) {
+         if ( $ancestor->isHtmlNamed( 'form' ) ) {
+             $this->formElementPointer = $ancestor;
+             break;
+         }
+         $ancestor = $ancestor->parent;
+     }
+
+     // The piece before the first '<' is text, not a tag.
+     $leading = $this->bitsIterator->current();
+     $this->bitsIterator->next();
+     $this->insertToken( 'text', str_replace( '>', '&gt;', $leading ) );
+     // Every remaining piece starts with a tag.
+     while ( $this->bitsIterator->valid() ) {
+         $this->advance();
+     }
+     $this->insertToken( 'eof', null );
+     $output = $this->stack->getOutput();
+
+     // Release the per-run objects before returning.
+     $this->bitsIterator = null;
+     $this->afe = null;
+     $this->stack = null;
+     $this->fragmentContext = null;
+     $this->formElementPointer = null;
+     return $output;
+ }
+
+ /**
+ * Pass a token to the tree builder. The $token will be one of the
+ * strings "tag", "endtag", "text", "comment", or "eof".
+ */
+ private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
+     // Tags are validated against $unsupportedSet.
+     $isTagToken = $token === 'tag' || $token === 'endtag';
+     if (
+         $isTagToken &&
+         isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] )
+     ) {
+         // As described in "simplifications" above, these tags are
+         // not supported in the balancer.
+         Assert::invariant(
+             !$this->strict,
+             "Unsupported $token <$value> found."
+         );
+         return false;
+     }
+     if ( $token === 'text' && $value === '' ) {
+         // An empty string is never injected as a text token.
+         return true;
+     }
+
+     // Support pre/listing/textarea by suppressing initial linefeed.
+     // The flag applies to the next token only, whatever its type.
+     if ( $this->ignoreLinefeed ) {
+         $this->ignoreLinefeed = false;
+         if ( $token === 'text' && $value[0] === "\n" ) {
+             if ( $value === "\n" ) {
+                 // Nothing would be left, don't inject the empty string.
+                 return true;
+             }
+             $value = substr( $value, 1 );
+         }
+     }
+
+     // Some hoops we have to jump through
+     $adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
+
+     // The spec calls this the "tree construction dispatcher": decide
+     // whether the current insertion mode handles the token, or the
+     // foreign-content rules do.
+     if (
+         $this->stack->length() === 0 ||
+         $adjusted->isHtml() ||
+         $token === 'eof'
+     ) {
+         $useInsertionMode = true;
+     } elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
+         $useInsertionMode = $token === 'text' || (
+             $token === 'tag' &&
+             $value !== 'mglyph' && $value !== 'malignmark'
+         );
+     } elseif (
+         $adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
+         $adjusted->localName === 'annotation-xml' &&
+         $token === 'tag' && $value === 'svg'
+     ) {
+         $useInsertionMode = true;
+     } else {
+         $useInsertionMode = $adjusted->isHtmlIntegrationPoint() &&
+             ( $token === 'tag' || $token === 'text' );
+     }
+
+     if ( !$useInsertionMode ) {
+         return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
+     }
+     // $parseMode holds the name of the insertion-mode method to call.
+     return $this->{$this->parseMode}( $token, $value, $attribs, $selfClose );
+ }
+
+ /**
+  * Process a token using the rules for foreign (MathML/SVG) content.
+  *
+  * Certain HTML start tags "break out" of foreign content: the stack is
+  * popped back to an integration point or HTML element and the token is
+  * reprocessed by insertToken(). Per the HTML spec a <font> start tag
+  * breaks out only when it carries a color, face or size attribute; a
+  * plain <font> is inserted as an ordinary foreign element.
+  *
+  * @param string $token Token type: 'text', 'comment', 'tag' or 'endtag'
+  * @param string $value Tag name, or text/comment contents
+  * @param array|null $attribs Attributes of a 'tag' token
+  * @param bool $selfClose Whether a 'tag' token was self-closing
+  * @return bool|null Result of processing; null if nothing handled the token
+  */
+ private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
+ 	if ( $token === 'text' ) {
+ 		$this->stack->insertText( $value );
+ 		return true;
+ 	} elseif ( $token === 'comment' ) {
+ 		$this->stack->insertComment( $value );
+ 		return true;
+ 	} elseif ( $token === 'tag' ) {
+ 		switch ( $value ) {
+ 		case 'font':
+ 			if (
+ 				!isset( $attribs['color'] ) &&
+ 				!isset( $attribs['face'] ) &&
+ 				!isset( $attribs['size'] )
+ 			) {
+ 				// A <font> without presentational attributes is not an
+ 				// HTML font tag: treat it as "any other start tag".
+ 				break;
+ 			}
+ 			// <font color|face|size> is an HTML tag: fall through
+ 		case 'b':
+ 		case 'big':
+ 		case 'blockquote':
+ 		case 'body':
+ 		case 'br':
+ 		case 'center':
+ 		case 'code':
+ 		case 'dd':
+ 		case 'div':
+ 		case 'dl':
+ 		case 'dt':
+ 		case 'em':
+ 		case 'embed':
+ 		case 'h1':
+ 		case 'h2':
+ 		case 'h3':
+ 		case 'h4':
+ 		case 'h5':
+ 		case 'h6':
+ 		case 'head':
+ 		case 'hr':
+ 		case 'i':
+ 		case 'img':
+ 		case 'li':
+ 		case 'listing':
+ 		case 'menu':
+ 		case 'meta':
+ 		case 'nobr':
+ 		case 'ol':
+ 		case 'p':
+ 		case 'pre':
+ 		case 'ruby':
+ 		case 's':
+ 		case 'small':
+ 		case 'span':
+ 		case 'strong':
+ 		case 'strike':
+ 		case 'sub':
+ 		case 'sup':
+ 		case 'table':
+ 		case 'tt':
+ 		case 'u':
+ 		case 'ul':
+ 		case 'var':
+ 			if ( $this->fragmentContext ) {
+ 				// In the fragment case, act as "any other start tag".
+ 				break;
+ 			}
+ 			// Pop at least one node, then keep popping until the current
+ 			// node is an integration point or an HTML element.
+ 			while ( true ) {
+ 				$this->stack->pop();
+ 				$node = $this->stack->currentNode;
+ 				if (
+ 					$node->isMathmlTextIntegrationPoint() ||
+ 					$node->isHtmlIntegrationPoint() ||
+ 					$node->isHtml()
+ 				) {
+ 					break;
+ 				}
+ 			}
+ 			// Reprocess the token with the (now HTML) dispatcher state.
+ 			return $this->insertToken( $token, $value, $attribs, $selfClose );
+ 		}
+ 		// "Any other start tag": the new element inherits the namespace
+ 		// of the adjusted current node.
+ 		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
+ 			$this->fragmentContext : $this->stack->currentNode;
+ 		$this->stack->insertForeignElement(
+ 			$adjusted->namespaceURI, $value, $attribs
+ 		);
+ 		if ( $selfClose ) {
+ 			$this->stack->pop();
+ 		}
+ 		return true;
+ 	} elseif ( $token === 'endtag' ) {
+ 		$first = true;
+ 		foreach ( $this->stack as $i => $node ) {
+ 			if ( $node->isHtml() && !$first ) {
+ 				// process the end tag as HTML
+ 				$func = $this->parseMode;
+ 				return $this->$func( $token, $value, $attribs, $selfClose );
+ 			} elseif ( $i === 0 ) {
+ 				// Reached the last node of the iteration without a match:
+ 				// ignore the end tag.
+ 				return true;
+ 			} elseif ( $node->localName === $value ) {
+ 				$this->stack->popTag( $node );
+ 				return true;
+ 			}
+ 			$first = false;
+ 		}
+ 	}
+ }
+
+ /**
+  * Grab the next "token" from $bitsIterator. This is either a open/close
+  * tag or text or a comment, depending on whether the Sanitizer approves.
+  *
+  * The piece read from the iterator is the text that followed a '<'. It
+  * is classified as a comment, a good tag followed by text, or a bad tag
+  * that is serialized back as text, and the resulting token(s) are passed
+  * to insertToken().
+  *
+  * While emulating the RCDATA/RAWTEXT tokenizer modes only the matching
+  * end tag is accepted as a tag; this holds whether or not a sanitizer
+  * whitelist ($allowedHtmlElements) is configured.
+  */
+ private function advance() {
+ 	$x = $this->bitsIterator->current();
+ 	$this->bitsIterator->next();
+ 	$regs = [];
+ 	// Handle comments. These won't be generated by mediawiki (they
+ 	// are stripped in the Sanitizer) but may be generated by extensions.
+ 	if (
+ 		$this->allowComments &&
+ 		!( $this->inRCDATA || $this->inRAWTEXT ) &&
+ 		preg_match( self::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
+ 		// verify EOF condition where necessary
+ 		( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
+ 	) {
+ 		$contents = $regs[2][0];
+ 		$rest = $regs[5][0];
+ 		$this->insertToken( 'comment', $contents );
+ 		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
+ 		return;
+ 	}
+ 	// $slash: Does the current element start with a '/'?
+ 	// $t: Current element name
+ 	// $attribStr: String between element name and >
+ 	// $brace: Ending '>' or '/>'
+ 	// $rest: Everything until the next element from the $bitsIterator
+ 	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
+ 		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
+ 		$t = strtolower( $t );
+ 		if ( $this->strict ) {
+ 			// Verify that attributes are all properly double-quoted
+ 			Assert::invariant(
+ 				preg_match(
+ 					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
+ 				),
+ 				"Bad attribute string found"
+ 			);
+ 		}
+ 	} else {
+ 		Assert::invariant(
+ 			!$this->strict, "< found which does not start a valid tag"
+ 		);
+ 		$slash = $t = $attribStr = $brace = $rest = null;
+ 	}
+ 	$goodTag = $t;
+ 	if ( $this->inRCDATA ) {
+ 		if ( $slash && $t === $this->inRCDATA ) {
+ 			$this->inRCDATA = false;
+ 		} else {
+ 			// No tags allowed; this emulates the "rcdata" tokenizer mode.
+ 			$goodTag = false;
+ 		}
+ 	}
+ 	if ( $this->inRAWTEXT ) {
+ 		if ( $slash && $t === $this->inRAWTEXT ) {
+ 			$this->inRAWTEXT = false;
+ 		} else {
+ 			// No tags allowed, no entity-escaping done.
+ 			$goodTag = false;
+ 		}
+ 	}
+ 	$sanitize = $this->allowedHtmlElements !== null;
+ 	if ( $sanitize ) {
+ 		// Narrow, never widen: a tag vetoed by the RCDATA/RAWTEXT checks
+ 		// above must stay vetoed even if it is on the whitelist.
+ 		$goodTag = $goodTag && isset( $this->allowedHtmlElements[$t] );
+ 	}
+ 	if ( $goodTag ) {
+ 		if ( is_callable( $this->processingCallback ) ) {
+ 			// $attribStr is passed by reference so the callback may rewrite it.
+ 			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
+ 		}
+ 		if ( $sanitize ) {
+ 			$goodTag = Sanitizer::validateTag( $attribStr, $t );
+ 		}
+ 	}
+ 	if ( $goodTag ) {
+ 		$attribs = Sanitizer::decodeTagAttributes( $attribStr );
+ 		if ( $sanitize ) {
+ 			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
+ 		}
+ 		// insertToken() returns false for tags the balancer does not
+ 		// support; those are serialized as text below.
+ 		$goodTag = $this->insertToken(
+ 			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
+ 		);
+ 	}
+ 	if ( $goodTag ) {
+ 		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
+ 	} elseif ( $this->inRAWTEXT ) {
+ 		// Raw text: restore the '<' and emit the piece unescaped.
+ 		$this->insertToken( 'text', "<$x" );
+ 	} else {
+ 		// bad tag; serialize entire thing as text.
+ 		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
+ 	}
+ }
+
+ /**
+  * Change the current insertion mode.
+  *
+  * @param string $mode Name of the handler method; must end in "Mode"
+  * @return string The insertion mode that was active before the switch
+  */
+ private function switchMode( $mode ) {
+ 	$hasModeSuffix = ( substr( $mode, -4 ) === 'Mode' );
+ 	Assert::parameter( $hasModeSuffix, '$mode', 'should end in Mode' );
+ 	$previousMode = $this->parseMode;
+ 	$this->parseMode = $mode;
+ 	return $previousMode;
+ }
+
+ /**
+  * Switch to another insertion mode, then feed the given token through
+  * insertToken() again so that it is handled under the new mode.
+  *
+  * @param string $mode Insertion mode to switch to
+  * @param string $token Token type
+  * @param string $value Token value
+  * @param array|null $attribs Token attributes
+  * @param bool $selfClose Self-closing flag of the token
+  * @return bool|null Whatever insertToken() returns for the token
+  */
+ private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) {
+ 	// The previous mode returned by switchMode() is not needed here.
+ 	$this->switchMode( $mode );
+ 	$result = $this->insertToken( $token, $value, $attribs, $selfClose );
+ 	return $result;
+ }
+
+ /**
+ * Pick the insertion mode that matches the current stack of open
+ * elements and switch to it.
+ *
+ * The stack is iterated until the entry with key 0, which is treated as
+ * the last node; for that entry the fragment context (if any) is examined
+ * instead of the node itself. The first HTML element with a dedicated
+ * insertion mode decides the result; if none is found by the last node
+ * the mode becomes inBodyMode.
+ */
+ private function resetInsertionMode() {
+ $last = false;
+ foreach ( $this->stack as $i => $node ) {
+ if ( $i === 0 ) {
+ $last = true;
+ if ( $this->fragmentContext ) {
+ $node = $this->fragmentContext;
+ }
+ }
+ if ( $node->isHtml() ) {
+ switch ( $node->localName ) {
+ case 'select':
+ // Look for a <table> among the ancestors of the <select>,
+ // giving up at a <template>.
+ // NOTE(review): key 0 is the last node visited by the outer
+ // loop, so $i appears to count from the other end of the
+ // stack than this arithmetic assumes; confirm against the
+ // stack iterator's keys and the indexing used by node().
+ $stackLength = $this->stack->length();
+ for ( $j = $i + 1; $j < $stackLength - 1; $j++ ) {
+ $ancestor = $this->stack->node( $stackLength - $j - 1 );
+ if ( $ancestor->isHtmlNamed( 'template' ) ) {
+ break;
+ }
+ if ( $ancestor->isHtmlNamed( 'table' ) ) {
+ $this->switchMode( 'inSelectInTableMode' );
+ return;
+ }
+ }
+ $this->switchMode( 'inSelectMode' );
+ return;
+ case 'tr':
+ $this->switchMode( 'inRowMode' );
+ return;
+ case 'tbody':
+ case 'tfoot':
+ case 'thead':
+ $this->switchMode( 'inTableBodyMode' );
+ return;
+ case 'caption':
+ $this->switchMode( 'inCaptionMode' );
+ return;
+ case 'colgroup':
+ $this->switchMode( 'inColumnGroupMode' );
+ return;
+ case 'table':
+ $this->switchMode( 'inTableMode' );
+ return;
+ case 'template':
+ // Use the most recently pushed template insertion mode.
+ $this->switchMode(
+ array_slice( $this->templateInsertionModes, -1 )[0]
+ );
+ return;
+ case 'body':
+ $this->switchMode( 'inBodyMode' );
+ return;
+ // OMITTED: <frameset>
+ // OMITTED: <html>
+ // OMITTED: <head>
+ default:
+ // Table cells select inCellMode, except on the last node.
+ if ( !$last ) {
+ // OMITTED: <head>
+ if ( $node->isA( BalanceSets::$tableCellSet ) ) {
+ $this->switchMode( 'inCellMode' );
+ return;
+ }
+ }
+ }
+ }
+ if ( $last ) {
+ // Nothing on the stack selected a mode.
+ $this->switchMode( 'inBodyMode' );
+ return;
+ }
+ }
+ }
+
+ /**
+  * Finish tree construction.
+  *
+  * Of the spec's "stop parsing" steps only step 2 applies here ("pop all
+  * the nodes off the stack of open elements"), with one deviation: the
+  * top-most <html> element is left on the stack.
+  */
+ private function stopParsing() {
+ 	// Drop the active formatting element list before unwinding the stack.
+ 	// Otherwise its element objects would stay live during serialization,
+ 	// potentially using O(N^2) memory. This is safe because popping the
+ 	// stack never reconstructs the active formatting elements.
+ 	$this->afe = null;
+
+ 	$this->stack->popTo( 1 );
+ }
+
+ /**
+  * Insert an HTML element whose contents are raw text and switch to
+  * inTextMode, remembering the insertion mode to return to afterwards.
+  *
+  * @param string $value Tag name of the raw text element
+  * @param array|null $attribs Attributes of the element
+  * @return bool Always true
+  */
+ private function parseRawText( $value, $attribs = null ) {
+ 	$this->stack->insertHTMLElement( $value, $attribs );
+ 	// advance() uses this to accept only the matching end tag as a tag.
+ 	$this->inRAWTEXT = $value;
+ 	$returnMode = $this->switchMode( 'inTextMode' );
+ 	$this->originalInsertionMode = $returnMode;
+ 	return true;
+ }
+
+ /**
+  * Insertion mode used while inside a raw text element.
+  *
+  * Text is appended; an end tag or EOF closes the element and restores
+  * the original insertion mode (EOF is then reprocessed under that mode).
+  * Every other token is ignored.
+  *
+  * @param string $token Token type
+  * @param string $value Token value
+  * @param array|null $attribs Token attributes
+  * @param bool $selfClose Self-closing flag of the token
+  * @return bool|null True, or the result of reprocessing an 'eof' token
+  */
+ private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) {
+ 	if ( $token === 'text' ) {
+ 		$this->stack->insertText( $value );
+ 		return true;
+ 	}
+ 	if ( $token !== 'eof' && $token !== 'endtag' ) {
+ 		// Nothing else is meaningful in this mode.
+ 		return true;
+ 	}
+
+ 	// Both remaining tokens close the current element.
+ 	$this->stack->pop();
+ 	if ( $token === 'endtag' ) {
+ 		$this->switchMode( $this->originalInsertionMode );
+ 		return true;
+ 	}
+ 	return $this->switchModeAndReprocess(
+ 		$this->originalInsertionMode, $token, $value, $attribs, $selfClose
+ 	);
+ }
+
+ /**
+ * Process a token using the "in head" rules.
+ *
+ * Leading whitespace of a text token is inserted directly. Void head
+ * elements are inserted and immediately popped, raw text elements are
+ * delegated to parseRawText(), and <template> start/end tags maintain
+ * the template insertion mode stack. Anything not handled here is
+ * reprocessed through insertToken() at the bottom of the function.
+ *
+ * @param string $token Token type
+ * @param string $value Tag name, or text/comment contents
+ * @param array|null $attribs Attributes of a 'tag' token
+ * @param bool $selfClose Whether a 'tag' token was self-closing
+ * @return bool|null True when handled here, otherwise the result of
+ * reprocessing the token
+ */
+ private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) {
+ if ( $token === 'text' ) {
+ // Split off and insert any leading whitespace.
+ if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
+ $this->stack->insertText( $matches[0] );
+ $value = substr( $value, strlen( $matches[0] ) );
+ }
+ if ( strlen( $value ) === 0 ) {
+ return true; // All text handled.
+ }
+ // Fall through to handle non-whitespace below.
+ } elseif ( $token === 'tag' ) {
+ switch ( $value ) {
+ case 'meta':
+ // OMITTED: in a full HTML parser, this might change the encoding.
+ // falls through
+ // OMITTED: <html>
+ case 'base':
+ case 'basefont':
+ case 'bgsound':
+ case 'link':
+ // These elements have no content: insert, then pop right away.
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->stack->pop();
+ return true;
+ // OMITTED: <title>
+ // OMITTED: <noscript>
+ case 'noframes':
+ case 'style':
+ return $this->parseRawText( $value, $attribs );
+ // OMITTED: <script>
+ case 'template':
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->afe->insertMarker();
+ // OMITTED: frameset_ok
+ $this->switchMode( 'inTemplateMode' );
+ // parseMode is now 'inTemplateMode'; push it on the template stack.
+ $this->templateInsertionModes[] = $this->parseMode;
+ return true;
+ // OMITTED: <head>
+ }
+ } elseif ( $token === 'endtag' ) {
+ switch ( $value ) {
+ // OMITTED: <head>
+ // OMITTED: <body>
+ // OMITTED: <html>
+ case 'br':
+ break; // handle at the bottom of the function
+ case 'template':
+ if ( $this->stack->indexOf( $value ) < 0 ) {
+ return true; // Ignore the token.
+ }
+ $this->stack->generateImpliedEndTags( null, true /* thorough */ );
+ $this->stack->popTag( $value );
+ $this->afe->clearToMarker();
+ array_pop( $this->templateInsertionModes );
+ $this->resetInsertionMode();
+ return true;
+ default:
+ // ignore any other end tag
+ return true;
+ }
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
+ }
+
+ // If not handled above
+ // (</head> has no case of its own in the end tag switch above, so
+ // this call lands in its default branch and is ignored.)
+ $this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
+ // Then redo this one
+ // NOTE(review): insertToken() dispatches on $this->parseMode, so this
+ // presumably relies on the parser never having inHeadMode as its
+ // current mode -- confirm against the callers.
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
+ }
+
+ /**
+ * Process a token using the "in body" rules; this is the main insertion
+ * mode and the fallback for most of the table-related modes.
+ *
+ * Text reconstructs the active formatting elements and is inserted.
+ * EOF stops parsing (or defers to inTemplateMode while template
+ * insertion modes are pending). Start and end tags are handled by the
+ * two large switches below; several cases synthesize an end tag by
+ * calling inBodyMode() recursively, or reprocess the token through
+ * insertToken().
+ *
+ * @param string $token Token type: 'text', 'eof', 'tag', 'endtag' or 'comment'
+ * @param string $value Tag name, or text/comment contents
+ * @param array|null $attribs Attributes of a 'tag' token
+ * @param bool $selfClose Whether a 'tag' token was self-closing
+ * @return bool|null True when the token was handled or ignored, or the
+ * result of the handler the token was delegated to
+ */
+ private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
+ if ( $token === 'text' ) {
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertText( $value );
+ return true;
+ } elseif ( $token === 'eof' ) {
+ if ( !empty( $this->templateInsertionModes ) ) {
+ return $this->inTemplateMode( $token, $value, $attribs, $selfClose );
+ }
+ $this->stopParsing();
+ return true;
+ } elseif ( $token === 'tag' ) {
+ switch ( $value ) {
+ // OMITTED: <html>
+ case 'base':
+ case 'basefont':
+ case 'bgsound':
+ case 'link':
+ case 'meta':
+ case 'noframes':
+ // OMITTED: <script>
+ case 'style':
+ case 'template':
+ // OMITTED: <title>
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
+ // OMITTED: <body>
+ // OMITTED: <frameset>
+
+ case 'address':
+ case 'article':
+ case 'aside':
+ case 'blockquote':
+ case 'center':
+ case 'details':
+ case 'dialog':
+ case 'dir':
+ case 'div':
+ case 'dl':
+ case 'fieldset':
+ case 'figcaption':
+ case 'figure':
+ case 'footer':
+ case 'header':
+ case 'hgroup':
+ case 'main':
+ case 'nav':
+ case 'ol':
+ case 'p':
+ case 'section':
+ case 'summary':
+ case 'ul':
+ // Block-level element: close an open <p> first.
+ if ( $this->stack->inButtonScope( 'p' ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
+ case 'menu':
+ if ( $this->stack->inButtonScope( "p" ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
+ case 'h1':
+ case 'h2':
+ case 'h3':
+ case 'h4':
+ case 'h5':
+ case 'h6':
+ if ( $this->stack->inButtonScope( 'p' ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ // Headings do not nest: pop a heading that is the current node.
+ if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
+ $this->stack->pop();
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
+ case 'pre':
+ case 'listing':
+ if ( $this->stack->inButtonScope( 'p' ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ // insertToken() drops a linefeed that directly follows this tag.
+ $this->ignoreLinefeed = true;
+ // OMITTED: frameset_ok
+ return true;
+
+ case 'form':
+ if (
+ $this->formElementPointer &&
+ $this->stack->indexOf( 'template' ) < 0
+ ) {
+ return true; // in a form, not in a template.
+ }
+ if ( $this->stack->inButtonScope( "p" ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ $elt = $this->stack->insertHTMLElement( $value, $attribs );
+ if ( $this->stack->indexOf( 'template' ) < 0 ) {
+ $this->formElementPointer = $elt;
+ }
+ return true;
+
+ case 'li':
+ // OMITTED: frameset_ok
+ // Close an open <li>, unless a special element other than
+ // address/div/p is encountered first.
+ foreach ( $this->stack as $node ) {
+ if ( $node->isHtmlNamed( 'li' ) ) {
+ $this->inBodyMode( 'endtag', 'li' );
+ break;
+ }
+ if (
+ $node->isA( BalanceSets::$specialSet ) &&
+ !$node->isA( BalanceSets::$addressDivPSet )
+ ) {
+ break;
+ }
+ }
+ if ( $this->stack->inButtonScope( 'p' ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
+ case 'dd':
+ case 'dt':
+ // OMITTED: frameset_ok
+ // Same as <li>, but an open <dd> or <dt> is closed.
+ foreach ( $this->stack as $node ) {
+ if ( $node->isHtmlNamed( 'dd' ) ) {
+ $this->inBodyMode( 'endtag', 'dd' );
+ break;
+ }
+ if ( $node->isHtmlNamed( 'dt' ) ) {
+ $this->inBodyMode( 'endtag', 'dt' );
+ break;
+ }
+ if (
+ $node->isA( BalanceSets::$specialSet ) &&
+ !$node->isA( BalanceSets::$addressDivPSet )
+ ) {
+ break;
+ }
+ }
+ if ( $this->stack->inButtonScope( 'p' ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
+ // OMITTED: <plaintext>
+
+ case 'button':
+ if ( $this->stack->inScope( 'button' ) ) {
+ // Close the open <button>, then reprocess this start tag.
+ $this->inBodyMode( 'endtag', 'button' );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
+ }
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
+ case 'a':
+ $activeElement = $this->afe->findElementByTag( 'a' );
+ if ( $activeElement ) {
+ $this->inBodyMode( 'endtag', 'a' );
+ if ( $this->afe->isInList( $activeElement ) ) {
+ $this->afe->remove( $activeElement );
+ // Don't flatten here, since when we fall
+ // through below we might foster parent
+ // the new <a> tag inside this one.
+ $this->stack->removeElement( $activeElement, false );
+ }
+ }
+ // Falls through
+ case 'b':
+ case 'big':
+ case 'code':
+ case 'em':
+ case 'font':
+ case 'i':
+ case 's':
+ case 'small':
+ case 'strike':
+ case 'strong':
+ case 'tt':
+ case 'u':
+ // Formatting element: insert it and record it in the AFE list.
+ $this->afe->reconstruct( $this->stack );
+ $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
+ return true;
+
+ case 'nobr':
+ $this->afe->reconstruct( $this->stack );
+ if ( $this->stack->inScope( 'nobr' ) ) {
+ $this->inBodyMode( 'endtag', 'nobr' );
+ $this->afe->reconstruct( $this->stack );
+ }
+ $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
+ return true;
+
+ case 'applet':
+ case 'marquee':
+ case 'object':
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->afe->insertMarker();
+ // OMITTED: frameset_ok
+ return true;
+
+ case 'table':
+ // The document is never in "quirks mode"; see simplifications
+ // above.
+ if ( $this->stack->inButtonScope( 'p' ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ // OMITTED: frameset_ok
+ $this->switchMode( 'inTableMode' );
+ return true;
+
+ case 'area':
+ case 'br':
+ case 'embed':
+ case 'img':
+ case 'keygen':
+ case 'wbr':
+ // Inserted and popped immediately.
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->stack->pop();
+ // OMITTED: frameset_ok
+ return true;
+
+ case 'input':
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->stack->pop();
+ // OMITTED: frameset_ok
+ // (hence we don't need to examine the tag's "type" attribute)
+ return true;
+
+ case 'param':
+ case 'source':
+ case 'track':
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->stack->pop();
+ return true;
+
+ case 'hr':
+ if ( $this->stack->inButtonScope( 'p' ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->stack->pop();
+ return true;
+
+ case 'image':
+ // warts!
+ // <image> is handled exactly as if it had been <img>.
+ return $this->inBodyMode( $token, 'img', $attribs, $selfClose );
+
+ case 'textarea':
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->ignoreLinefeed = true;
+ $this->inRCDATA = $value; // emulate rcdata tokenizer mode
+ // OMITTED: frameset_ok
+ return true;
+
+ // OMITTED: <xmp>
+ // OMITTED: <iframe>
+ // OMITTED: <noembed>
+ // OMITTED: <noscript>
+
+ case 'select':
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ // The next mode depends on whether we got here from a table mode.
+ switch ( $this->parseMode ) {
+ case 'inTableMode':
+ case 'inCaptionMode':
+ case 'inTableBodyMode':
+ case 'inRowMode':
+ case 'inCellMode':
+ $this->switchMode( 'inSelectInTableMode' );
+ return true;
+ default:
+ $this->switchMode( 'inSelectMode' );
+ return true;
+ }
+
+ case 'optgroup':
+ case 'option':
+ if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
+ $this->inBodyMode( 'endtag', 'option' );
+ }
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
+ case 'menuitem':
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
+ case 'rb':
+ case 'rtc':
+ if ( $this->stack->inScope( 'ruby' ) ) {
+ $this->stack->generateImpliedEndTags();
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
+ case 'rp':
+ case 'rt':
+ if ( $this->stack->inScope( 'ruby' ) ) {
+ $this->stack->generateImpliedEndTags( 'rtc' );
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
+ case 'math':
+ $this->afe->reconstruct( $this->stack );
+ // We skip the spec's "adjust MathML attributes" and
+ // "adjust foreign attributes" steps, since the browser will
+ // do this later when it parses the output and it doesn't affect
+ // balancing.
+ $this->stack->insertForeignElement(
+ BalanceSets::MATHML_NAMESPACE, $value, $attribs
+ );
+ if ( $selfClose ) {
+ // emit explicit </math> tag.
+ $this->stack->pop();
+ }
+ return true;
+
+ case 'svg':
+ $this->afe->reconstruct( $this->stack );
+ // We skip the spec's "adjust SVG attributes" and
+ // "adjust foreign attributes" steps, since the browser will
+ // do this later when it parses the output and it doesn't affect
+ // balancing.
+ $this->stack->insertForeignElement(
+ BalanceSets::SVG_NAMESPACE, $value, $attribs
+ );
+ if ( $selfClose ) {
+ // emit explicit </svg> tag.
+ $this->stack->pop();
+ }
+ return true;
+
+ case 'caption':
+ case 'col':
+ case 'colgroup':
+ // OMITTED: <frame>
+ case 'head':
+ case 'tbody':
+ case 'td':
+ case 'tfoot':
+ case 'th':
+ case 'thead':
+ case 'tr':
+ // Ignore table tags if we're not inTableMode
+ return true;
+ }
+
+ // Handle any other start tag here
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+ } elseif ( $token === 'endtag' ) {
+ switch ( $value ) {
+ // </body>,</html> are unsupported.
+
+ case 'template':
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
+
+ case 'address':
+ case 'article':
+ case 'aside':
+ case 'blockquote':
+ case 'button':
+ case 'center':
+ case 'details':
+ case 'dialog':
+ case 'dir':
+ case 'div':
+ case 'dl':
+ case 'fieldset':
+ case 'figcaption':
+ case 'figure':
+ case 'footer':
+ case 'header':
+ case 'hgroup':
+ case 'listing':
+ case 'main':
+ case 'menu':
+ case 'nav':
+ case 'ol':
+ case 'pre':
+ case 'section':
+ case 'summary':
+ case 'ul':
+ // Ignore if there is not a matching open tag
+ if ( !$this->stack->inScope( $value ) ) {
+ return true;
+ }
+ $this->stack->generateImpliedEndTags();
+ $this->stack->popTag( $value );
+ return true;
+
+ case 'form':
+ if ( $this->stack->indexOf( 'template' ) < 0 ) {
+ // Outside a template the form element pointer identifies
+ // the form to close; it is cleared either way.
+ $openform = $this->formElementPointer;
+ $this->formElementPointer = null;
+ if ( !$openform || !$this->stack->inScope( $openform ) ) {
+ return true;
+ }
+ $this->stack->generateImpliedEndTags();
+ // Don't flatten yet if we're removing a <form> element
+ // out-of-order. (eg. `<form><div></form>`)
+ $flatten = ( $this->stack->currentNode === $openform );
+ $this->stack->removeElement( $openform, $flatten );
+ } else {
+ if ( !$this->stack->inScope( 'form' ) ) {
+ return true;
+ }
+ $this->stack->generateImpliedEndTags();
+ $this->stack->popTag( 'form' );
+ }
+ return true;
+
+ case 'p':
+ if ( !$this->stack->inButtonScope( 'p' ) ) {
+ // No open <p>: synthesize one, then reprocess this </p>.
+ $this->inBodyMode( 'tag', 'p', [] );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
+ }
+ $this->stack->generateImpliedEndTags( $value );
+ $this->stack->popTag( $value );
+ return true;
+
+ case 'li':
+ if ( !$this->stack->inListItemScope( $value ) ) {
+ return true; // ignore
+ }
+ $this->stack->generateImpliedEndTags( $value );
+ $this->stack->popTag( $value );
+ return true;
+
+ case 'dd':
+ case 'dt':
+ if ( !$this->stack->inScope( $value ) ) {
+ return true; // ignore
+ }
+ $this->stack->generateImpliedEndTags( $value );
+ $this->stack->popTag( $value );
+ return true;
+
+ case 'h1':
+ case 'h2':
+ case 'h3':
+ case 'h4':
+ case 'h5':
+ case 'h6':
+ // Any heading end tag closes whichever heading is open.
+ if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
+ return true; // ignore
+ }
+ $this->stack->generateImpliedEndTags();
+ $this->stack->popTag( BalanceSets::$headingSet );
+ return true;
+
+ case 'sarcasm':
+ // Take a deep breath, then:
+ break;
+
+ case 'a':
+ case 'b':
+ case 'big':
+ case 'code':
+ case 'em':
+ case 'font':
+ case 'i':
+ case 'nobr':
+ case 's':
+ case 'small':
+ case 'strike':
+ case 'strong':
+ case 'tt':
+ case 'u':
+ if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
+ return true; // If we did something, we're done.
+ }
+ break; // Go to the "any other end tag" case.
+
+ case 'applet':
+ case 'marquee':
+ case 'object':
+ if ( !$this->stack->inScope( $value ) ) {
+ return true; // ignore
+ }
+ $this->stack->generateImpliedEndTags();
+ $this->stack->popTag( $value );
+ $this->afe->clearToMarker();
+ return true;
+
+ case 'br':
+ // Turn </br> into <br>
+ return $this->inBodyMode( 'tag', $value, [] );
+ }
+
+ // Any other end tag goes here
+ // Walk the stack looking for a matching HTML element; give up at
+ // the first special element.
+ foreach ( $this->stack as $i => $node ) {
+ if ( $node->isHtmlNamed( $value ) ) {
+ $this->stack->generateImpliedEndTags( $value );
+ $this->stack->popTo( $i ); // including $i
+ break;
+ } elseif ( $node->isA( BalanceSets::$specialSet ) ) {
+ return true; // ignore this close token.
+ }
+ }
+ return true;
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
+ } else {
+ // Unknown token type: this is a programming error.
+ Assert::invariant( false, "Bad token type: $token" );
+ }
+ }
+
+ /**
+ * Process a token using the "in table" rules.
+ *
+ * Table structure tags open the corresponding element (synthesizing
+ * <colgroup> or <tbody> where required) and switch to the matching
+ * mode. Tokens that do not belong directly in a table are handled by
+ * inBodyMode() with foster parenting enabled on the stack.
+ *
+ * @param string $token Token type
+ * @param string $value Tag name, or text/comment contents
+ * @param array|null $attribs Attributes of a 'tag' token
+ * @param bool $selfClose Whether a 'tag' token was self-closing
+ * @return bool|null True when handled here, otherwise the result of the
+ * handler the token was delegated to
+ */
+ private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) {
+ if ( $token === 'text' ) {
+ if ( $this->textIntegrationMode ) {
+ return $this->inBodyMode( $token, $value, $attribs, $selfClose );
+ } elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
+ // Buffer the text in inTableTextMode and remember the mode to
+ // come back to.
+ $this->pendingTableText = '';
+ $this->originalInsertionMode = $this->parseMode;
+ return $this->switchModeAndReprocess( 'inTableTextMode',
+ $token, $value, $attribs, $selfClose );
+ }
+ // fall through to default case.
+ } elseif ( $token === 'eof' ) {
+ $this->stopParsing();
+ return true;
+ } elseif ( $token === 'tag' ) {
+ switch ( $value ) {
+ case 'caption':
+ $this->afe->insertMarker();
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->switchMode( 'inCaptionMode' );
+ return true;
+ case 'colgroup':
+ $this->stack->clearToContext( BalanceSets::$tableContextSet );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->switchMode( 'inColumnGroupMode' );
+ return true;
+ case 'col':
+ // Synthesize a <colgroup>, then reprocess the <col>.
+ $this->inTableMode( 'tag', 'colgroup', [] );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
+ case 'tbody':
+ case 'tfoot':
+ case 'thead':
+ $this->stack->clearToContext( BalanceSets::$tableContextSet );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->switchMode( 'inTableBodyMode' );
+ return true;
+ case 'td':
+ case 'th':
+ case 'tr':
+ // Synthesize a <tbody>, then reprocess the token.
+ $this->inTableMode( 'tag', 'tbody', [] );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
+ case 'table':
+ if ( !$this->stack->inTableScope( $value ) ) {
+ return true; // Ignore this tag.
+ }
+ // Close the open table, then reprocess the new <table>.
+ $this->inTableMode( 'endtag', $value );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
+
+ case 'style':
+ // OMITTED: <script>
+ case 'template':
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
+
+ case 'input':
+ // Only <input type=hidden> (case-insensitive) is inserted in place.
+ if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
+ break; // Handle this as "everything else"
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->stack->pop();
+ return true;
+
+ case 'form':
+ if (
+ $this->formElementPointer ||
+ $this->stack->indexOf( 'template' ) >= 0
+ ) {
+ return true; // ignore this token
+ }
+ // Insert the form, record it, and pop it again right away.
+ $this->formElementPointer =
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->stack->popTag( $this->formElementPointer );
+ return true;
+ }
+ // Fall through for "anything else" clause.
+ } elseif ( $token === 'endtag' ) {
+ switch ( $value ) {
+ case 'table':
+ if ( !$this->stack->inTableScope( $value ) ) {
+ return true; // Ignore.
+ }
+ $this->stack->popTag( $value );
+ $this->resetInsertionMode();
+ return true;
+ // OMITTED: <body>
+ case 'caption':
+ case 'col':
+ case 'colgroup':
+ // OMITTED: <html>
+ case 'tbody':
+ case 'td':
+ case 'tfoot':
+ case 'th':
+ case 'thead':
+ case 'tr':
+ return true; // Ignore the token.
+ case 'template':
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
+ }
+ // Fall through for "anything else" clause.
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
+ }
+ // This is the "anything else" case:
+ // Process as in body with foster parenting switched on; the result of
+ // inBodyMode() is discarded and true is returned regardless.
+ $this->stack->fosterParentMode = true;
+ $this->inBodyMode( $token, $value, $attribs, $selfClose );
+ $this->stack->fosterParentMode = false;
+ return true;
+ }
+
+ /**
+  * Insertion mode that buffers character data found inside a table.
+  *
+  * Text tokens are accumulated. The first non-text token flushes the
+  * buffer (foster-parented via inBodyMode() if it contains anything but
+  * whitespace, inserted in place otherwise) and is then reprocessed
+  * under the original insertion mode.
+  *
+  * @param string $token Token type
+  * @param string $value Token value
+  * @param array|null $attribs Token attributes
+  * @param bool $selfClose Self-closing flag of the token
+  * @return bool|null True for buffered text, otherwise the result of
+  *  reprocessing the token
+  */
+ private function inTableTextMode( $token, $value, $attribs = null, $selfClose = false ) {
+ 	if ( $token === 'text' ) {
+ 		$this->pendingTableText .= $value;
+ 		return true;
+ 	}
+
+ 	// Any other token ends the run of text: take the buffer and reset it.
+ 	$buffered = $this->pendingTableText;
+ 	$this->pendingTableText = '';
+
+ 	$whitespaceOnly = !preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $buffered );
+ 	if ( $whitespaceOnly ) {
+ 		$this->stack->insertText( $buffered );
+ 	} else {
+ 		// Mirrors the "anything else" case of inTableMode.
+ 		$this->stack->fosterParentMode = true;
+ 		$this->inBodyMode( 'text', $buffered );
+ 		$this->stack->fosterParentMode = false;
+ 	}
+
+ 	return $this->switchModeAndReprocess(
+ 		$this->originalInsertionMode, $token, $value, $attribs, $selfClose
+ 	);
+ }
+
+ /**
+  * Helper for inCaptionMode: close the open <caption>, if there is one
+  * in table scope, and go back to inTableMode.
+  *
+  * @return bool True if a caption was closed, false if none was in scope
+  */
+ private function endCaption() {
+ 	$openElements = $this->stack;
+ 	if ( $openElements->inTableScope( 'caption' ) ) {
+ 		$openElements->generateImpliedEndTags();
+ 		$openElements->popTag( 'caption' );
+ 		$this->afe->clearToMarker();
+ 		$this->switchMode( 'inTableMode' );
+ 		return true;
+ 	}
+ 	return false;
+ }
+
+ /**
+  * Process a token using the "in caption" rules.
+  *
+  * Table structure start tags and </table> close the caption and are
+  * then reprocessed; </caption> closes it; stray table-related end tags
+  * are ignored. Everything else is handled by inBodyMode().
+  *
+  * @param string $token Token type
+  * @param string $value Tag name, or text/comment contents
+  * @param array|null $attribs Attributes of a 'tag' token
+  * @param bool $selfClose Whether a 'tag' token was self-closing
+  * @return bool|null True when handled here, otherwise the result of
+  *  inBodyMode()
+  */
+ private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) {
+ 	if ( $token === 'tag' ) {
+ 		$closingStartTags = [
+ 			'caption', 'col', 'colgroup', 'tbody', 'td',
+ 			'tfoot', 'th', 'thead', 'tr',
+ 		];
+ 		if ( in_array( $value, $closingStartTags, true ) ) {
+ 			// Reprocess only if a caption was actually closed.
+ 			if ( $this->endCaption() ) {
+ 				$this->insertToken( $token, $value, $attribs, $selfClose );
+ 			}
+ 			return true;
+ 		}
+ 	} elseif ( $token === 'endtag' ) {
+ 		if ( $value === 'caption' ) {
+ 			$this->endCaption();
+ 			return true;
+ 		}
+ 		if ( $value === 'table' ) {
+ 			if ( $this->endCaption() ) {
+ 				$this->insertToken( $token, $value, $attribs, $selfClose );
+ 			}
+ 			return true;
+ 		}
+ 		// OMITTED: <html>
+ 		$ignoredEndTags = [
+ 			'body', 'col', 'colgroup', 'tbody', 'td',
+ 			'tfoot', 'th', 'thead', 'tr',
+ 		];
+ 		if ( in_array( $value, $ignoredEndTags, true ) ) {
+ 			return true;
+ 		}
+ 	}
+ 	// Anything else is processed as in body.
+ 	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
+ }
+
+ /**
+  * Process a token using the "in column group" rules.
+  *
+  * Leading whitespace, comments, <col> and <template> are handled in
+  * place. Any other token closes the current <colgroup> (when that is
+  * the current node) and is reprocessed; otherwise it is ignored.
+  *
+  * @param string $token Token type
+  * @param string $value Tag name, or text/comment contents
+  * @param array|null $attribs Attributes of a 'tag' token
+  * @param bool $selfClose Whether a 'tag' token was self-closing
+  * @return bool|null True when handled or ignored, otherwise the result
+  *  of the handler the token was delegated to
+  */
+ private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) {
+ 	if ( $token === 'comment' ) {
+ 		$this->stack->insertComment( $value );
+ 		return true;
+ 	}
+ 	if ( $token === 'eof' ) {
+ 		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
+ 	}
+
+ 	if ( $token === 'text' ) {
+ 		// Insert the leading run of whitespace; the remainder (if any)
+ 		// is handled as "anything else" below.
+ 		$leading = strspn( $value, "\x09\x0A\x0C\x0D\x20" );
+ 		if ( $leading > 0 ) {
+ 			$this->stack->insertText( substr( $value, 0, $leading ) );
+ 			$value = substr( $value, $leading );
+ 		}
+ 		if ( strlen( $value ) === 0 ) {
+ 			return true; // All text handled.
+ 		}
+ 	} elseif ( $token === 'tag' ) {
+ 		// OMITTED: <html>
+ 		if ( $value === 'col' ) {
+ 			$this->stack->insertHTMLElement( $value, $attribs );
+ 			$this->stack->pop();
+ 			return true;
+ 		}
+ 		if ( $value === 'template' ) {
+ 			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
+ 		}
+ 	} elseif ( $token === 'endtag' ) {
+ 		if ( $value === 'colgroup' ) {
+ 			if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
+ 				$this->stack->pop();
+ 				$this->switchMode( 'inTableMode' );
+ 			}
+ 			// Otherwise the token is ignored.
+ 			return true;
+ 		}
+ 		if ( $value === 'col' ) {
+ 			return true; // Ignore the token.
+ 		}
+ 		if ( $value === 'template' ) {
+ 			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
+ 		}
+ 	}
+
+ 	// Anything else
+ 	if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
+ 		return true; // Ignore the token.
+ 	}
+ 	// Synthesize </colgroup>, then reprocess the token.
+ 	$this->inColumnGroupMode( 'endtag', 'colgroup' );
+ 	return $this->insertToken( $token, $value, $attribs, $selfClose );
+ }
+
+ /**
+  * Helper for inTableBodyMode: close the current table section
+  * (tbody/thead/tfoot) and go back to inTableMode.
+  *
+  * @return bool True if a section was closed, false if none of the
+  *  section elements is in table scope
+  */
+ private function endSection() {
+ 	$sectionOpen = false;
+ 	foreach ( [ 'tbody', 'thead', 'tfoot' ] as $sectionName ) {
+ 		if ( $this->stack->inTableScope( $sectionName ) ) {
+ 			$sectionOpen = true;
+ 			break;
+ 		}
+ 	}
+ 	if ( !$sectionOpen ) {
+ 		return false;
+ 	}
+ 	$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
+ 	$this->stack->pop();
+ 	$this->switchMode( 'inTableMode' );
+ 	return true;
+ }
+ /**
+  * Process a token using the "in table body" rules.
+  *
+  * <tr> opens a row (a <tr> is synthesized for a bare <td>/<th>); tags
+  * that end the section close it and are reprocessed; stray end tags are
+  * ignored. Everything else is handled by inTableMode().
+  *
+  * @param string $token Token type
+  * @param string $value Tag name, or text/comment contents
+  * @param array|null $attribs Attributes of a 'tag' token
+  * @param bool $selfClose Whether a 'tag' token was self-closing
+  * @return bool|null True when handled here, otherwise the result of
+  *  inTableMode()
+  */
+ private function inTableBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
+ 	if ( $token === 'tag' ) {
+ 		if ( $value === 'tr' ) {
+ 			$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
+ 			$this->stack->insertHTMLElement( $value, $attribs );
+ 			$this->switchMode( 'inRowMode' );
+ 			return true;
+ 		}
+ 		if ( $value === 'th' || $value === 'td' ) {
+ 			// Synthesize the missing <tr>, then reprocess the cell.
+ 			$this->inTableBodyMode( 'tag', 'tr', [] );
+ 			$this->insertToken( $token, $value, $attribs, $selfClose );
+ 			return true;
+ 		}
+ 		$sectionEnders = [ 'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead' ];
+ 		if ( in_array( $value, $sectionEnders, true ) ) {
+ 			if ( $this->endSection() ) {
+ 				$this->insertToken( $token, $value, $attribs, $selfClose );
+ 			}
+ 			return true;
+ 		}
+ 	} elseif ( $token === 'endtag' ) {
+ 		if ( $value === 'table' ) {
+ 			if ( $this->endSection() ) {
+ 				$this->insertToken( $token, $value, $attribs, $selfClose );
+ 			}
+ 			return true;
+ 		}
+ 		if ( in_array( $value, [ 'tbody', 'tfoot', 'thead' ], true ) ) {
+ 			// Only acts when that particular section is in table scope.
+ 			if ( $this->stack->inTableScope( $value ) ) {
+ 				$this->endSection();
+ 			}
+ 			return true;
+ 		}
+ 		// OMITTED: <body>
+ 		// OMITTED: <html>
+ 		$ignoredEndTags = [ 'caption', 'col', 'colgroup', 'td', 'th', 'tr' ];
+ 		if ( in_array( $value, $ignoredEndTags, true ) ) {
+ 			return true; // Ignore the token.
+ 		}
+ 	}
+ 	// Anything else:
+ 	return $this->inTableMode( $token, $value, $attribs, $selfClose );
+ }
+
+ // Helper function for inRowMode
+ /**
+  * Close the currently open table row, if there is one.
+  *
+  * When a tr element is in table scope, the stack is cleared to the
+  * table-row context, the current node is popped and the mode is switched
+  * to 'inTableBodyMode'.
+  *
+  * @return bool True if a row was closed; false if no tr was in table
+  *  scope (in which case nothing is changed).
+  */
+ private function endRow() {
+ 	if ( $this->stack->inTableScope( 'tr' ) ) {
+ 		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
+ 		$this->stack->pop();
+ 		$this->switchMode( 'inTableBodyMode' );
+ 		return true;
+ 	}
+ 	// No open row to close.
+ 	return false;
+ }
+ /**
+ * Token handler for the 'inRowMode' insertion mode.
+ *
+ * Start tags: 'th'/'td' open a cell, switch to 'inCellMode' and insert a
+ * marker into the active formatting elements list ($this->afe);
+ * caption/col/colgroup/tbody/tfoot/thead/tr close the row via endRow()
+ * and are re-fed through insertToken() only if a row was closed.
+ * End tags: 'tr' closes the row; 'table' closes the row and is re-fed;
+ * tbody/tfoot/thead do the same but only when that section element is in
+ * table scope; caption/col/colgroup/td/th are ignored.
+ * Every other token is delegated to inTableMode().
+ *
+ * @param string $token Token type; this method distinguishes 'tag' and 'endtag'
+ * @param mixed $value Tag name for 'tag'/'endtag' tokens
+ * @param array|null $attribs Passed through to insertHTMLElement()/insertToken()
+ * @param bool $selfClose Passed through to insertToken()/inTableMode()
+ * @return mixed True on every branch handled here, otherwise the result of
+ * inTableMode(). NOTE(review): the meaning of the return value is defined
+ * by the caller, which is not visible in this section -- confirm.
+ */
+ private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) {
+ if ( $token === 'tag' ) {
+ switch ( $value ) {
+ case 'th':
+ case 'td':
+ $this->stack->clearToContext( BalanceSets::$tableRowContextSet );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->switchMode( 'inCellMode' );
+ // The matching clearToMarker() call is made when the cell is closed
+ // in inCellMode().
+ $this->afe->insertMarker();
+ return true;
+ case 'caption':
+ case 'col':
+ case 'colgroup':
+ case 'tbody':
+ case 'tfoot':
+ case 'thead':
+ case 'tr':
+ // Reprocess only if a row was closed; otherwise the token is dropped.
+ if ( $this->endRow() ) {
+ $this->insertToken( $token, $value, $attribs, $selfClose );
+ }
+ return true;
+ }
+ } elseif ( $token === 'endtag' ) {
+ switch ( $value ) {
+ case 'tr':
+ $this->endRow();
+ return true;
+ case 'table':
+ if ( $this->endRow() ) {
+ $this->insertToken( $token, $value, $attribs, $selfClose );
+ }
+ return true;
+ case 'tbody':
+ case 'tfoot':
+ case 'thead':
+ // Both conditions must hold: the named section is in table scope
+ // and there was a row to close. Short-circuiting means endRow() is
+ // not called at all when the section is not in scope.
+ if (
+ $this->stack->inTableScope( $value ) &&
+ $this->endRow()
+ ) {
+ $this->insertToken( $token, $value, $attribs, $selfClose );
+ }
+ return true;
+ // OMITTED: <body>
+ case 'caption':
+ case 'col':
+ case 'colgroup':
+ // OMITTED: <html>
+ case 'td':
+ case 'th':
+ return true; // Ignore the token.
+ }
+ }
+ // Anything else:
+ return $this->inTableMode( $token, $value, $attribs, $selfClose );
+ }
+
+ // Helper for inCellMode
+ /**
+  * Close the currently open table cell, if there is one.
+  *
+  * Looks for a td and then a th element in table scope; for the first one
+  * found, the matching end tag is fed to inCellMode(), which performs the
+  * actual closing.
+  *
+  * @return bool True if a td or th was in table scope and its end tag was
+  *  processed; false if neither was in table scope.
+  */
+ private function endCell() {
+ 	foreach ( [ 'td', 'th' ] as $cellName ) {
+ 		if ( $this->stack->inTableScope( $cellName ) ) {
+ 			$this->inCellMode( 'endtag', $cellName );
+ 			return true;
+ 		}
+ 	}
+ 	// Neither kind of cell is open.
+ 	return false;
+ }
+ /**
+ * Token handler for the 'inCellMode' insertion mode.
+ *
+ * Start tags caption/col/colgroup/tbody/td/tfoot/th/thead/tr close the
+ * open cell via endCell() and are re-fed through insertToken() only if a
+ * cell was closed.
+ * End tags: 'td'/'th' close that cell when it is in table scope
+ * (generate implied end tags, pop up to the cell, clear the active
+ * formatting elements list to its last marker, switch to 'inRowMode');
+ * caption/col/colgroup are ignored; table/tbody/tfoot/thead/tr, when the
+ * named element is in table scope, close whichever cell is open in the
+ * same way and are then re-fed through insertToken().
+ * Every other token is delegated to inBodyMode().
+ *
+ * @param string $token Token type; this method distinguishes 'tag' and 'endtag'
+ * @param mixed $value Tag name for 'tag'/'endtag' tokens
+ * @param array|null $attribs Passed through to insertToken()/inBodyMode()
+ * @param bool $selfClose Passed through to insertToken()/inBodyMode()
+ * @return mixed True on every branch handled here, otherwise the result of
+ * inBodyMode(). NOTE(review): the meaning of the return value is defined
+ * by the caller, which is not visible in this section -- confirm.
+ */
+ private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) {
+ if ( $token === 'tag' ) {
+ switch ( $value ) {
+ case 'caption':
+ case 'col':
+ case 'colgroup':
+ case 'tbody':
+ case 'td':
+ case 'tfoot':
+ case 'th':
+ case 'thead':
+ case 'tr':
+ // Reprocess only if a cell was closed; otherwise the token is dropped.
+ if ( $this->endCell() ) {
+ $this->insertToken( $token, $value, $attribs, $selfClose );
+ }
+ return true;
+ }
+ } elseif ( $token === 'endtag' ) {
+ switch ( $value ) {
+ case 'td':
+ case 'th':
+ // An end tag for a cell that is not in table scope is ignored.
+ if ( $this->stack->inTableScope( $value ) ) {
+ $this->stack->generateImpliedEndTags();
+ $this->stack->popTag( $value );
+ $this->afe->clearToMarker();
+ $this->switchMode( 'inRowMode' );
+ }
+ return true;
+ // OMITTED: <body>
+ case 'caption':
+ case 'col':
+ case 'colgroup':
+ // OMITTED: <html>
+ return true;
+
+ case 'table':
+ case 'tbody':
+ case 'tfoot':
+ case 'thead':
+ case 'tr':
+ if ( $this->stack->inTableScope( $value ) ) {
+ // Close the open cell, whichever kind it is (popTag() is given the
+ // cell set rather than a single name), then reprocess the end tag
+ // in the mode that was just switched to.
+ $this->stack->generateImpliedEndTags();
+ $this->stack->popTag( BalanceSets::$tableCellSet );
+ $this->afe->clearToMarker();
+ $this->switchMode( 'inRowMode' );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
+ }
+ return true;
+ }
+ }
+ // Anything else:
+ return $this->inBodyMode( $token, $value, $attribs, $selfClose );
+ }
+
+ /**
+ * Token handler for the 'inSelectMode' insertion mode.
+ *
+ * Text and comments are inserted at the current position; 'eof' is
+ * delegated to inBodyMode(). Start tags 'option' and 'optgroup' first
+ * pop an open option (and, for 'optgroup', an open optgroup) before
+ * inserting the new element. A nested 'select' start tag is handled as a
+ * 'select' end tag. 'input'/'keygen'/'textarea' close the select (when
+ * one is in select scope) and are then re-fed through insertToken().
+ * 'script'/'template' start tags and the 'template' end tag are
+ * delegated to inHeadMode(). All remaining tokens are ignored.
+ *
+ * @param string $token Token type: 'text', 'eof', 'tag', 'endtag' or 'comment'
+ * are handled explicitly
+ * @param mixed $value Text or comment content, or the tag name for
+ * 'tag'/'endtag' tokens
+ * @param array|null $attribs Passed through to insertHTMLElement() and the
+ * delegated handlers
+ * @param bool $selfClose Passed through to the delegated handlers
+ * @return mixed True on every branch handled here, otherwise the result of
+ * the delegated handler. NOTE(review): the meaning of the return value is
+ * defined by the caller, which is not visible in this section -- confirm.
+ */
+ private function inSelectMode( $token, $value, $attribs = null, $selfClose = false ) {
+ if ( $token === 'text' ) {
+ $this->stack->insertText( $value );
+ return true;
+ } elseif ( $token === 'eof' ) {
+ return $this->inBodyMode( $token, $value, $attribs, $selfClose );
+ } elseif ( $token === 'tag' ) {
+ switch ( $value ) {
+ // OMITTED: <html>
+ case 'option':
+ // A new option implicitly closes an open one.
+ if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
+ $this->stack->pop();
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+ case 'optgroup':
+ // A new optgroup implicitly closes an open option and then an open
+ // optgroup; the two checks are sequential because the first pop can
+ // expose an optgroup as the current node.
+ if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
+ $this->stack->pop();
+ }
+ if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
+ $this->stack->pop();
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+ case 'select':
+ $this->inSelectMode( 'endtag', $value ); // treat it like endtag
+ return true;
+ case 'input':
+ case 'keygen':
+ case 'textarea':
+ if ( !$this->stack->inSelectScope( 'select' ) ) {
+ return true; // ignore token (fragment case)
+ }
+ // Close the select (which also resets the insertion mode), then
+ // reprocess this start tag in the new mode.
+ $this->inSelectMode( 'endtag', 'select' );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
+ case 'script':
+ case 'template':
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
+ }
+ } elseif ( $token === 'endtag' ) {
+ switch ( $value ) {
+ case 'optgroup':
+ // If the current node is an option whose immediate parent on the
+ // stack is an optgroup, pop the option first so that the optgroup
+ // becomes the current node.
+ if (
+ $this->stack->currentNode->isHtmlNamed( 'option' ) &&
+ $this->stack->length() >= 2 &&
+ $this->stack->node( $this->stack->length() - 2 )->isHtmlNamed( 'optgroup' )
+ ) {
+ $this->stack->pop();
+ }
+ if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
+ $this->stack->pop();
+ }
+ return true;
+ case 'option':
+ if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
+ $this->stack->pop();
+ }
+ return true;
+ case 'select':
+ if ( !$this->stack->inSelectScope( $value ) ) {
+ return true; // fragment case
+ }
+ $this->stack->popTag( $value );
+ $this->resetInsertionMode();
+ return true;
+ case 'template':
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
+ }
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
+ }
+ // anything else: just ignore the token
+ return true;
+ }
+
+ /**
+  * Token handler for the 'inSelectInTableMode' insertion mode.
+  *
+  * Start and end tags for caption/table/tbody/tfoot/thead/tr/td/th get
+  * special treatment: a start tag always closes the select (by feeding a
+  * 'select' end tag back into this handler) and is then re-fed through
+  * insertToken(); an end tag does the same, but only when the named element
+  * is in table scope, and is ignored otherwise. Everything else is
+  * delegated to inSelectMode().
+  *
+  * @param string $token Token type; 'tag' and 'endtag' are distinguished here
+  * @param mixed $value Tag name for 'tag'/'endtag' tokens
+  * @param array|null $attribs Passed through to insertToken()/inSelectMode()
+  * @param bool $selfClose Passed through to insertToken()/inSelectMode()
+  * @return mixed True when an end tag is ignored, otherwise the result of
+  *  insertToken() or inSelectMode()
+  */
+ private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) {
+ 	$isStartTag = ( $token === 'tag' );
+ 	$isEndTag = ( $token === 'endtag' );
+
+ 	if ( $isStartTag || $isEndTag ) {
+ 		switch ( $value ) {
+ 			case 'caption':
+ 			case 'table':
+ 			case 'tbody':
+ 			case 'tfoot':
+ 			case 'thead':
+ 			case 'tr':
+ 			case 'td':
+ 			case 'th':
+ 				// An end tag for an element that is not in table scope is dropped.
+ 				if ( $isEndTag && !$this->stack->inTableScope( $value ) ) {
+ 					return true;
+ 				}
+ 				// Close the select first, then reprocess the table-related token.
+ 				$this->inSelectInTableMode( 'endtag', 'select' );
+ 				return $this->insertToken( $token, $value, $attribs, $selfClose );
+ 		}
+ 	}
+
+ 	// anything else
+ 	return $this->inSelectMode( $token, $value, $attribs, $selfClose );
+ }
+
+ /**
+ * Token handler for the 'inTemplateMode' insertion mode.
+ *
+ * Text and comments are delegated to inBodyMode(). On 'eof', parsing
+ * stops if no template is on the stack; otherwise the template is
+ * popped, the active formatting elements list is cleared to its last
+ * marker, one entry is removed from $this->templateInsertionModes, the
+ * insertion mode is reset and the 'eof' token is re-fed through
+ * insertToken(). Start tags are routed by name: head-like tags go to
+ * inHeadMode(); table-related tags switch to the matching table mode and
+ * are reprocessed there; all other start tags switch to 'inBodyMode'.
+ * The 'template' end tag goes to inHeadMode(); other end tags are
+ * ignored.
+ *
+ * @param string $token Token type: 'text', 'comment', 'eof', 'tag' or 'endtag';
+ * any other value trips an invariant assertion
+ * @param mixed $value Text or comment content, or the tag name for
+ * 'tag'/'endtag' tokens
+ * @param array|null $attribs Passed through to the delegated handlers
+ * @param bool $selfClose Passed through to the delegated handlers
+ * @return mixed True on branches handled here, otherwise the result of the
+ * delegated handler; no value is explicitly returned on the bad-token path.
+ * NOTE(review): the meaning of the return value is defined by the caller,
+ * which is not visible in this section -- confirm.
+ */
+ private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) {
+ if ( $token === 'text' || $token === 'comment' ) {
+ return $this->inBodyMode( $token, $value, $attribs, $selfClose );
+ } elseif ( $token === 'eof' ) {
+ // NOTE(review): presumably indexOf() yields a negative value when no
+ // template element is on the stack -- confirm against its definition.
+ if ( $this->stack->indexOf( 'template' ) < 0 ) {
+ $this->stopParsing();
+ } else {
+ // Unwind the unclosed template, then reprocess 'eof' in whatever
+ // mode resetInsertionMode() selects.
+ $this->stack->popTag( 'template' );
+ $this->afe->clearToMarker();
+ array_pop( $this->templateInsertionModes );
+ $this->resetInsertionMode();
+ $this->insertToken( $token, $value, $attribs, $selfClose );
+ }
+ return true;
+ } elseif ( $token === 'tag' ) {
+ // NOTE(review): switchModeAndReprocess() is defined elsewhere; from its
+ // name and use here it presumably switches to the given mode and
+ // re-dispatches the same token -- confirm.
+ switch ( $value ) {
+ case 'base':
+ case 'basefont':
+ case 'bgsound':
+ case 'link':
+ case 'meta':
+ case 'noframes':
+ // OMITTED: <script>
+ case 'style':
+ case 'template':
+ // OMITTED: <title>
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
+
+ case 'caption':
+ case 'colgroup':
+ case 'tbody':
+ case 'tfoot':
+ case 'thead':
+ return $this->switchModeAndReprocess(
+ 'inTableMode', $token, $value, $attribs, $selfClose
+ );
+
+ case 'col':
+ return $this->switchModeAndReprocess(
+ 'inColumnGroupMode', $token, $value, $attribs, $selfClose
+ );
+
+ case 'tr':
+ return $this->switchModeAndReprocess(
+ 'inTableBodyMode', $token, $value, $attribs, $selfClose
+ );
+
+ case 'td':
+ case 'th':
+ return $this->switchModeAndReprocess(
+ 'inRowMode', $token, $value, $attribs, $selfClose
+ );
+ }
+ // Any other start tag.
+ return $this->switchModeAndReprocess(
+ 'inBodyMode', $token, $value, $attribs, $selfClose
+ );
+ } elseif ( $token === 'endtag' ) {
+ switch ( $value ) {
+ case 'template':
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
+ }
+ // Any other end tag is ignored.
+ return true;
+ } else {
+ // Unknown token type: fail the invariant check. Nothing is returned
+ // after this call.
+ Assert::invariant( false, "Bad token type: $token" );
+ }
+ }
+}