diff options
Diffstat (limited to 'www/wiki/includes/tidy/Balancer.php')
-rw-r--r-- | www/wiki/includes/tidy/Balancer.php | 3584 |
1 files changed, 3584 insertions, 0 deletions
diff --git a/www/wiki/includes/tidy/Balancer.php b/www/wiki/includes/tidy/Balancer.php new file mode 100644 index 00000000..6671f49b --- /dev/null +++ b/www/wiki/includes/tidy/Balancer.php @@ -0,0 +1,3584 @@ +<?php +/** + * An implementation of the tree building portion of the HTML5 parsing + * spec. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Parser + * @since 1.27 + * @author C. Scott Ananian, 2016 + */ + +namespace MediaWiki\Tidy; + +use ExplodeIterator; +use IteratorAggregate; +use ReverseArrayIterator; +use Sanitizer; +use Wikimedia\Assert\Assert; +use Wikimedia\Assert\ParameterAssertionException; + +// A note for future librarization[1] -- this file is a good candidate +// for splitting into an independent library, except that it is currently +// highly optimized for MediaWiki use. It only implements the portions +// of the HTML5 tree builder used by tags supported by MediaWiki, and +// does not contain a true tokenizer pass, instead relying on +// comment stripping, attribute normalization, and escaping done by +// the MediaWiki Sanitizer. 
// It also deliberately avoids building
// a true DOM in memory, instead serializing elements to an output string
// as soon as possible (usually as soon as the tag is closed) to reduce
// its memory footprint.

// We've been gradually lifting some of these restrictions to handle
// non-sanitized output generated by extensions, but we shortcut the tokenizer
// for speed (primarily by splitting on `<`) and so rely on syntactic
// well-formedness.

// On the other hand, I've been pretty careful to note with comments in the
// code the places where this implementation omits features of the spec or
// depends on the MediaWiki Sanitizer.  Perhaps in the future we'll want to
// implement the missing pieces and make this a standalone PHP HTML5 parser.
// In order to do so, some sort of MediaWiki-specific API will need
// to be added to (a) allow the Balancer to bypass the tokenizer,
// and (b) support on-the-fly flattening instead of DOM node creation.

// [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki

/**
 * Namespace constants and tag-name lookup tables used by the HTML5 tree
 * building algorithm.
 *
 * Every table is an associative array keyed first by namespace URI and
 * then by lower-cased tag name; membership is tested with isset(), the
 * stored value is always `true`.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	/** HTML tags listed as unsupported. */
	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'head' => true,
			'body' => true,
			'frameset' => true,
			'frame' => true,
			'plaintext' => true,
			'xmp' => true,
			'iframe' => true,
			'noembed' => true,
			'noscript' => true,
			'script' => true,
			'title' => true
		]
	];

	/** HTML elements serialized as a single self-closed tag, with no children. */
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'br' => true, 'col' => true, 'command' => true, 'embed' => true,
			'frame' => true, 'hr' => true, 'img' => true, 'input' => true,
			'keygen' => true, 'link' => true, 'meta' => true, 'param' => true,
			'source' => true, 'track' => true, 'wbr' => true
		]
	];

	/** Elements whose leading linefeed is doubled when flattening in tidy-compat mode. */
	public static $extraLinefeedSet = [
		self::HTML_NAMESPACE => [
			'pre' => true,
			'textarea' => true,
			'listing' => true,
		]
	];

	/** The six HTML heading elements. */
	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true
		]
	];

	/** The "special" category of elements, across HTML, SVG and MathML. */
	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true, 'article' => true,
			'aside' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'blockquote' => true, 'body' => true, 'br' => true, 'button' => true,
			'caption' => true, 'center' => true, 'col' => true, 'colgroup' => true,
			'dd' => true, 'details' => true, 'dir' => true, 'div' => true,
			'dl' => true, 'dt' => true, 'embed' => true, 'fieldset' => true,
			'figcaption' => true, 'figure' => true, 'footer' => true, 'form' => true,
			'frame' => true, 'frameset' => true, 'h1' => true, 'h2' => true,
			'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true,
			'head' => true, 'header' => true, 'hgroup' => true, 'hr' => true,
			'html' => true, 'iframe' => true, 'img' => true, 'input' => true,
			'li' => true, 'link' => true, 'listing' => true, 'main' => true,
			'marquee' => true, 'menu' => true, 'meta' => true, 'nav' => true,
			'noembed' => true, 'noframes' => true, 'noscript' => true, 'object' => true,
			'ol' => true, 'p' => true, 'param' => true, 'plaintext' => true,
			'pre' => true, 'script' => true, 'section' => true, 'select' => true,
			'source' => true, 'style' => true, 'summary' => true, 'table' => true,
			'tbody' => true, 'td' => true, 'template' => true, 'textarea' => true,
			'tfoot' => true, 'th' => true, 'thead' => true, 'title' => true,
			'tr' => true, 'track' => true, 'ul' => true, 'wbr' => true,
			'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true
		]
	];

	/** address / div / p. */
	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [
			'address' => true,
			'div' => true,
			'p' => true
		]
	];

	/** table, its section elements, and tr. */
	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'thead' => true,
			'tbody' => true,
			'tfoot' => true,
			'tr' => true
		]
	];

	/** Elements closed when generating implied end tags. */
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true, 'menuitem' => true,
			'optgroup' => true, 'option' => true, 'p' => true, 'rb' => true,
			'rp' => true, 'rt' => true, 'rtc' => true
		]
	];

	/** Elements closed when generating implied end tags "thoroughly". */
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true, 'dd' => true,
			'dt' => true, 'li' => true, 'optgroup' => true,
			'option' => true, 'p' => true, 'rb' => true,
			'rp' => true, 'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true, 'tfoot' => true,
			'th' => true, 'thead' => true, 'tr' => true
		]
	];

	/** Table cell elements. */
	public static $tableCellSet = [
		self::HTML_NAMESPACE => [
			'td' => true,
			'th' => true
		]
	];

	/** Stop set for "clear the stack back to a table context". */
	public static $tableContextSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'template' => true,
			'html' => true
		]
	];

	/** Stop set for "clear the stack back to a table body context". */
	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true,
			'tfoot' => true,
			'thead' => true,
			'template' => true,
			'html' => true
		]
	];

	/** Stop set for "clear the stack back to a table row context". */
	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [
			'tr' => true,
			'template' => true,
			'html' => true
		]
	];

	// See https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
	public static $formAssociatedSet = [
		self::HTML_NAMESPACE => [
			'button' => true, 'fieldset' => true, 'input' => true,
			'keygen' => true, 'object' => true, 'output' => true,
			'select' => true, 'textarea' => true, 'img' => true
		]
	];

	/** Scope markers for the generic "has an element in scope" test. */
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true, 'html' => true,
			'marquee' => true, 'object' => true, 'table' => true,
			'td' => true, 'template' => true, 'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true
		]
	];

	/** Memoized result of inListItemScopeSet(). */
	private static $inListItemScopeSet = null;

	/**
	 * Scope markers for "list item scope": $inScopeSet plus HTML ol and ul.
	 * Built on first use and cached afterwards.
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		$markers = self::$inScopeSet;
		$markers[self::HTML_NAMESPACE]['ol'] = true;
		$markers[self::HTML_NAMESPACE]['ul'] = true;
		self::$inListItemScopeSet = $markers;
		return $markers;
	}

	/** Memoized result of inButtonScopeSet(). */
	private static $inButtonScopeSet = null;

	/**
	 * Scope markers for "button scope": $inScopeSet plus HTML button.
	 * Built on first use and cached afterwards.
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		$markers = self::$inScopeSet;
		$markers[self::HTML_NAMESPACE]['button'] = true;
		self::$inButtonScopeSet = $markers;
		return $markers;
	}

	/** Scope markers for "table scope". */
	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'table' => true,
			'template' => true
		]
	];

	/**
	 * Elements which do NOT terminate "select scope" (the scope test for
	 * select is defined by inverting this set).
	 */
	public static $inInvertedSelectScopeSet = [
		self::HTML_NAMESPACE => [
			'option' => true,
			'optgroup' => true
		]
	];

	/** MathML text integration points. */
	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true
		]
	];

	/** SVG elements which are HTML integration points. */
	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		]
	];

	// For tidy compatibility.
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true,
			'blockquote' => true,
			// We parse with <body> as the fragment context, but the top-level
			// element on the stack is actually <html>.  We could use the
			// "adjusted current node" everywhere to work around this, but it's
			// easier just to add <html> to the p-wrap set.
			'html' => true,
		],
	];

	/** Elements tidy treats as inline (they may stay inside a p-wrapper). */
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true,
			'applet' => true, 'b' => true, 'basefont' => true,
			'bdo' => true, 'big' => true, 'br' => true,
			'button' => true, 'cite' => true, 'code' => true,
			'dfn' => true, 'em' => true, 'font' => true,
			'i' => true, 'iframe' => true, 'img' => true,
			'input' => true, 'kbd' => true, 'label' => true,
			'legend' => true, 'map' => true, 'object' => true,
			'param' => true, 'q' => true, 'rb' => true,
			'rbc' => true, 'rp' => true, 'rt' => true,
			'rtc' => true, 'ruby' => true, 's' => true,
			'samp' => true, 'select' => true, 'small' => true,
			'span' => true, 'strike' => true, 'strong' => true,
			'sub' => true, 'sup' => true, 'textarea' => true,
			'tt' => true, 'u' => true, 'var' => true,
			// Those defined in tidy.conf
			'video' => true, 'audio' => true, 'bdi' => true,
			'data' => true, 'time' => true, 'mark' => true,
		],
	];
}

/**
 * A BalanceElement is a simplified version of a DOM Node.  The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements.
As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Parent of this element, or the string "flat" if this element has
	 * already been flattened into its parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element.  Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
	 * @var BalanceElement[]|string[] $children
	 */
	public $children;

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 */
	public $nextNoah;

	/**
	 * Initialized to the empty string by the constructor and not read by
	 * any method of this class.  It is declared explicitly (and last, so
	 * the property order of instances is unchanged) because the constructor
	 * used to create it as a dynamic property, which is deprecated as of
	 * PHP 8.2.
	 * @var string $contents
	 */
	public $contents;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes.
	 *
	 * The new element has no parent and no children.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->contents = '';
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element.
	 *
	 * On return $elt has a null parent and no longer appears in
	 * $this->children.
	 *
	 * @param BalanceElement $elt
	 */
	private function removeChild( BalanceElement $elt ) {
		// Keep the message constant: interpolating $this here would run
		// __toString() (serializing the whole subtree) on every call, even
		// when the precondition holds.
		Assert::precondition(
			$this->parent !== 'flat', "Can't removeChild after flattening."
		);
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it.
	 *
	 * If $b is an element which already has a parent, it is first detached
	 * from that parent.
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement|string $b
	 */
	public function insertBefore( BalanceElement $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( is_string( $b ) ) {
			array_splice( $this->children, $idx, 0, [ $b ] );
		} else {
			Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
			if ( $b->parent !== null ) {
				$b->parent->removeChild( $b );
			}
			array_splice( $this->children, $idx, 0, [ $b ] );
			$b->parent = $this;
		}
	}

	/**
	 * Append $elt to the end of the list of children.
	 *
	 * If $elt is an element which already has a parent, it is first
	 * detached from that parent.
	 *
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		if ( is_string( $elt ) ) {
			array_push( $this->children, $elt );
			return;
		}
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		array_push( $this->children, $elt );
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this.
	 *
	 * On return $elt has no children.
	 *
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( BalanceElement $elt ) {
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations.
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string.
	 *
	 * After this call $this->parent is the string 'flat'.
	 *
	 * @param array $config Balancer configuration; see Balancer::__construct().
	 *   Only the 'tidyCompat' key is read here.
	 * @return string The serialization which replaced this node in its
	 *   parent; in tidy-compat mode this is the empty string for a
	 *   whitespace-only p-wrapper.
	 *
	 * @see __toString()
	 */
	public function flatten( array $config ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		$tidyCompat = $config['tidyCompat'];
		if ( $tidyCompat ) {
			// Flatten every element child (each replaces itself by a string
			// in $this->children) and note whether anything other than
			// whitespace was produced.
			$blank = true;
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					$elt = $elt->flatten( $config );
				}
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
				// A p-wrapper is emitted as <p>; $blank is left alone so
				// that a whitespace-only wrapper serializes to ''.
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				$blank = false;
			} elseif (
				$this->isA( BalanceSets::$extraLinefeedSet ) &&
				count( $this->children ) > 0 &&
				substr( $this->children[0], 0, 1 ) === "\n"
			) {
				// Double the linefeed after pre/listing/textarea
				// according to the (old) HTML5 fragment serialization
				// algorithm (see https://github.com/whatwg/html/issues/944)
				// to ensure this will round-trip.
				array_unshift( $this->children, "\n" );
			}
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; // for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification.
	 *
	 * Elements in BalanceSets::$emptyElementSet are written as a single
	 * self-closed tag and must not have children.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// flatten children
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	// Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement,
	 *   set (from the BalanceSets class), or string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		if ( $set instanceof BalanceElement ) {
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			// assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if this element is an HTML element with the specified name
	 * @param string $tagName
	 * @return bool
	 */
	public function isHtmlNamed( $tagName ) {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
			&& $this->localName === $tagName;
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification.
	 *
	 * That is either a member of BalanceSets::$htmlIntegrationPointSet, or
	 * a MathML annotation-xml element whose `encoding` attribute is
	 * (case-insensitively) text/html or application/xhtml+xml.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		if (
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml' &&
			isset( $this->attribs['encoding'] ) &&
			( strcasecmp( $this->attribs['encoding'], 'text/html' ) === 0 ||
			strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) === 0 )
		) {
			return true;
		}
		return false;
	}

	/**
	 * Get a string key for the Noah's Ark algorithm
	 *
	 * The key is the serialization of the namespace, the local name and
	 * the attributes sorted by attribute name; it is computed once and
	 * cached.
	 *
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey === null ) {
			$attribs = $this->attribs;
			ksort( $attribs );
			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
		}
		return $this->noahKey;
	}
}

/**
 * The "stack of open elements" as defined in the HTML5 tree builder
 * spec.  This contains methods to ensure that content (start tags, text)
 * are inserted at the correct place in the output string, and to
 * flatten BalanceElements as they are closed to avoid holding onto
 * a complete DOM tree for the document in memory.
 *
 * The stack defines a PHP iterator to traverse it in "reverse order",
 * that is, the most-recently-added element is visited first in a
 * foreach loop.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
 */
class BalanceStack implements IteratorAggregate {
	/**
	 * Backing storage for the stack.
	 * @var BalanceElement[] $elements
	 */
	private $elements = [];
	/**
	 * Foster parent mode determines how nodes are inserted into the
	 * stack.
	 * @var bool $fosterParentMode
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
	 */
	public $fosterParentMode = false;
	/**
	 * Configuration options governing flattening.
+ * @var array $config + * @see Balancer::__construct() + */ + private $config; + /** + * Reference to the current element + */ + public $currentNode; + + /** + * Create a new BalanceStack with a single BalanceElement on it, + * representing the root <html> node. + * @param array $config Balancer configuration; see Balancer::_construct(). + */ + public function __construct( array $config ) { + // always a root <html> element on the stack + array_push( + $this->elements, + new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] ) + ); + $this->currentNode = $this->elements[0]; + $this->config = $config; + } + + /** + * Return a string representing the output of the tree builder: + * all the children of the root <html> node. + * @return string + */ + public function getOutput() { + // Don't include the outer '<html>....</html>' + $out = ''; + foreach ( $this->elements[0]->children as $elt ) { + $out .= is_string( $elt ) ? $elt : + $elt->flatten( $this->config ); + } + return $out; + } + + /** + * Insert a comment at the appropriate place for inserting a node. + * @param string $value Content of the comment. + * @return string + * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment + */ + public function insertComment( $value ) { + // Just another type of text node, except for tidy p-wrapping. + return $this->insertText( '<!--' . $value . '-->', true ); + } + + /** + * Insert text at the appropriate place for inserting a node. 
+ * @param string $value + * @param bool $isComment + * @return string + * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node + */ + public function insertText( $value, $isComment = false ) { + if ( + $this->fosterParentMode && + $this->currentNode->isA( BalanceSets::$tableSectionRowSet ) + ) { + $this->fosterParent( $value ); + } elseif ( + $this->config['tidyCompat'] && !$isComment && + $this->currentNode->isA( BalanceSets::$tidyPWrapSet ) + ) { + $this->insertHTMLElement( 'mw:p-wrap', [] ); + return $this->insertText( $value ); + } else { + $this->currentNode->appendChild( $value ); + } + } + + /** + * Insert a BalanceElement at the appropriate place, pushing it + * on to the open elements stack. + * @param string $namespaceURI The element namespace + * @param string $tag The tag name + * @param string $attribs Normalized attributes, as a string. + * @return BalanceElement + * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element + */ + public function insertForeignElement( $namespaceURI, $tag, $attribs ) { + return $this->insertElement( + new BalanceElement( $namespaceURI, $tag, $attribs ) + ); + } + + /** + * Insert an HTML element at the appropriate place, pushing it on to + * the open elements stack. + * @param string $tag The tag name + * @param string $attribs Normalized attributes, as a string. + * @return BalanceElement + * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element + */ + public function insertHTMLElement( $tag, $attribs ) { + return $this->insertForeignElement( + BalanceSets::HTML_NAMESPACE, $tag, $attribs + ); + } + + /** + * Insert an element at the appropriate place and push it on to the + * open elements stack. 
+ * @param BalanceElement $elt + * @return BalanceElement + * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node + */ + public function insertElement( BalanceElement $elt ) { + if ( + $this->currentNode->isHtmlNamed( 'mw:p-wrap' ) && + !$elt->isA( BalanceSets::$tidyInlineSet ) + ) { + // Tidy compatibility. + $this->pop(); + } + if ( + $this->fosterParentMode && + $this->currentNode->isA( BalanceSets::$tableSectionRowSet ) + ) { + $elt = $this->fosterParent( $elt ); + } else { + $this->currentNode->appendChild( $elt ); + } + Assert::invariant( $elt->parent !== null, "$elt must be in tree" ); + Assert::invariant( $elt->parent !== 'flat', "$elt must not have been previous flattened" ); + array_push( $this->elements, $elt ); + $this->currentNode = $elt; + return $elt; + } + + /** + * Determine if the stack has $tag in scope. + * @param BalanceElement|array|string $tag + * @return bool + * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope + */ + public function inScope( $tag ) { + return $this->inSpecificScope( $tag, BalanceSets::$inScopeSet ); + } + + /** + * Determine if the stack has $tag in button scope. + * @param BalanceElement|array|string $tag + * @return bool + * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope + */ + public function inButtonScope( $tag ) { + return $this->inSpecificScope( $tag, BalanceSets::inButtonScopeSet() ); + } + + /** + * Determine if the stack has $tag in list item scope. + * @param BalanceElement|array|string $tag + * @return bool + * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope + */ + public function inListItemScope( $tag ) { + return $this->inSpecificScope( $tag, BalanceSets::inListItemScopeSet() ); + } + + /** + * Determine if the stack has $tag in table scope. 
+ * @param BalanceElement|array|string $tag + * @return bool + * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope + */ + public function inTableScope( $tag ) { + return $this->inSpecificScope( $tag, BalanceSets::$inTableScopeSet ); + } + + /** + * Determine if the stack has $tag in select scope. + * @param BalanceElement|array|string $tag + * @return bool + * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope + */ + public function inSelectScope( $tag ) { + // Can't use inSpecificScope to implement this, since it involves + // *inverting* a set of tags. Implement manually. + foreach ( $this as $elt ) { + if ( $elt->isA( $tag ) ) { + return true; + } + if ( !$elt->isA( BalanceSets::$inInvertedSelectScopeSet ) ) { + return false; + } + } + return false; + } + + /** + * Determine if the stack has $tag in a specific scope, $set. + * @param BalanceElement|array|string $tag + * @param BalanceElement|array|string $set + * @return bool + * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope + */ + public function inSpecificScope( $tag, $set ) { + foreach ( $this as $elt ) { + if ( $elt->isA( $tag ) ) { + return true; + } + if ( $elt->isA( $set ) ) { + return false; + } + } + return false; + } + + /** + * Generate implied end tags. + * @param string $butnot + * @param bool $thorough True if we should generate end tags thoroughly. + * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags + */ + public function generateImpliedEndTags( $butnot = null, $thorough = false ) { + $endTagSet = $thorough ? + BalanceSets::$thoroughImpliedEndTagsSet : + BalanceSets::$impliedEndTagsSet; + while ( $this->currentNode ) { + if ( $butnot !== null && $this->currentNode->isHtmlNamed( $butnot ) ) { + break; + } + if ( !$this->currentNode->isA( $endTagSet ) ) { + break; + } + $this->pop(); + } + } + + /** + * Return the adjusted current node. 
+ * @param string $fragmentContext + * @return string + */ + public function adjustedCurrentNode( $fragmentContext ) { + return ( $fragmentContext && count( $this->elements ) === 1 ) ? + $fragmentContext : $this->currentNode; + } + + /** + * Return an iterator over this stack which visits the current node + * first, and the root node last. + * @return \Iterator + */ + public function getIterator() { + return new ReverseArrayIterator( $this->elements ); + } + + /** + * Return the BalanceElement at the given position $idx, where + * position 0 represents the root element. + * @param int $idx + * @return BalanceElement + */ + public function node( $idx ) { + return $this->elements[ $idx ]; + } + + /** + * Replace the element at position $idx in the BalanceStack with $elt. + * @param int $idx + * @param BalanceElement $elt + */ + public function replaceAt( $idx, BalanceElement $elt ) { + Assert::precondition( + $this->elements[$idx]->parent !== 'flat', + 'Replaced element should not have already been flattened.' + ); + Assert::precondition( + $elt->parent !== 'flat', + 'New element should not have already been flattened.' + ); + $this->elements[$idx] = $elt; + if ( $idx === count( $this->elements ) - 1 ) { + $this->currentNode = $elt; + } + } + + /** + * Return the position of the given BalanceElement, set, or + * HTML tag name string in the BalanceStack. + * @param BalanceElement|array|string $tag + * @return int + */ + public function indexOf( $tag ) { + for ( $i = count( $this->elements ) - 1; $i >= 0; $i-- ) { + if ( $this->elements[$i]->isA( $tag ) ) { + return $i; + } + } + return -1; + } + + /** + * Return the number of elements currently in the BalanceStack. + * @return int + */ + public function length() { + return count( $this->elements ); + } + + /** + * Remove the current node from the BalanceStack, flattening it + * in the process. 
+ */ + public function pop() { + $elt = array_pop( $this->elements ); + if ( count( $this->elements ) ) { + $this->currentNode = $this->elements[ count( $this->elements ) - 1 ]; + } else { + $this->currentNode = null; + } + if ( !$elt->isHtmlNamed( 'mw:p-wrap' ) ) { + $elt->flatten( $this->config ); + } + } + + /** + * Remove all nodes up to and including position $idx from the + * BalanceStack, flattening them in the process. + * @param int $idx + */ + public function popTo( $idx ) { + for ( $length = count( $this->elements ); $length > $idx; $length-- ) { + $this->pop(); + } + } + + /** + * Pop elements off the stack up to and including the first + * element with the specified HTML tagname (or matching the given + * set). + * @param BalanceElement|array|string $tag + */ + public function popTag( $tag ) { + while ( $this->currentNode ) { + if ( $this->currentNode->isA( $tag ) ) { + $this->pop(); + break; + } + $this->pop(); + } + } + + /** + * Pop elements off the stack *not including* the first element + * in the specified set. + * @param BalanceElement|array|string $set + */ + public function clearToContext( $set ) { + // Note that we don't loop to 0. Never pop the <html> elt off. + for ( $length = count( $this->elements ); $length > 1; $length-- ) { + if ( $this->currentNode->isA( $set ) ) { + break; + } + $this->pop(); + } + } + + /** + * Remove the given $elt from the BalanceStack, optionally + * flattening it in the process. + * @param BalanceElement $elt The element to remove. + * @param bool $flatten Whether to flatten the removed element. + */ + public function removeElement( BalanceElement $elt, $flatten = true ) { + Assert::parameter( + $elt->parent !== 'flat', + '$elt', + '$elt should not already have been flattened.' + ); + Assert::parameter( + $elt->parent->parent !== 'flat', + '$elt', + 'The parent of $elt should not already have been flattened.' 
+ ); + $idx = array_search( $elt, $this->elements, true ); + Assert::parameter( $idx !== false, '$elt', 'must be in stack' ); + array_splice( $this->elements, $idx, 1 ); + if ( $idx === count( $this->elements ) ) { + $this->currentNode = $this->elements[$idx - 1]; + } + if ( $flatten ) { + // serialize $elt into its parent + // otherwise, it will eventually serialize when the parent + // is serialized, we just hold onto the memory for its + // tree of objects a little longer. + $elt->flatten( $this->config ); + } + Assert::postcondition( + array_search( $elt, $this->elements, true ) === false, + '$elt should no longer be in open elements stack' + ); + } + + /** + * Find $a in the BalanceStack and insert $b after it. + * @param BalanceElement $a + * @param BalanceElement $b + */ + public function insertAfter( BalanceElement $a, BalanceElement $b ) { + $idx = $this->indexOf( $a ); + Assert::parameter( $idx !== false, '$a', 'must be in stack' ); + if ( $idx === count( $this->elements ) - 1 ) { + array_push( $this->elements, $b ); + $this->currentNode = $b; + } else { + array_splice( $this->elements, $idx + 1, 0, [ $b ] ); + } + } + + // Fostering and adoption. + + /** + * Foster parent the given $elt in the stack of open elements. + * @param BalanceElement|string $elt + * @return BalanceElement|string + * + * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent + */ + private function fosterParent( $elt ) { + $lastTable = $this->indexOf( 'table' ); + $lastTemplate = $this->indexOf( 'template' ); + $parent = null; + $before = null; + + if ( $lastTemplate >= 0 && ( $lastTable < 0 || $lastTemplate > $lastTable ) ) { + $parent = $this->elements[$lastTemplate]; + } elseif ( $lastTable >= 0 ) { + $parent = $this->elements[$lastTable]->parent; + // Assume all tables have parents, since we're not running scripts! 
			Assert::invariant(
				$parent !== null, "All tables should have parents"
			);
			// Foster-parented content goes immediately before the table.
			$before = $this->elements[$lastTable];
		} else {
			$parent = $this->elements[0]; // the `html` element.
		}

		if ( $this->config['tidyCompat'] ) {
			if ( is_string( $elt ) ) {
				// We're fostering text: do we need a p-wrapper?
				if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) {
					$this->insertHTMLElement( 'mw:p-wrap', [] );
					$this->insertText( $elt );
					return $elt;
				}
			} else {
				// We're fostering an element; do we need to merge p-wrappers?
				if ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
					// $idx is the insertion point among the parent's children;
					// $after is the child immediately preceding that point
					// (or '' when there is none).
					$idx = $before ?
						array_search( $before, $parent->children, true ) :
						count( $parent->children );
					$after = $idx > 0 ? $parent->children[$idx - 1] : '';
					if (
						$after instanceof BalanceElement &&
						$after->isHtmlNamed( 'mw:p-wrap' )
					) {
						return $after; // Re-use existing p-wrapper.
					}
				}
			}
		}

		if ( $before ) {
			$parent->insertBefore( $before, $elt );
		} else {
			$parent->appendChild( $elt );
		}
		return $elt;
	}

	/**
	 * Run the "adoption agency algorithm" (AAA) for the given subject
	 * tag name.
	 * @param string $tag The subject tag name.
	 * @param BalanceActiveFormattingElements $afe The current
	 *   active formatting elements list.
	 * @return bool True if the adoption agency algorithm "did something",
	 *   false if more processing is required by the caller.
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
	 */
	public function adoptionAgency( $tag, $afe ) {
		// If the current node is an HTML element whose tag name is subject,
		// and the current node is not in the list of active formatting
		// elements, then pop the current node off the stack of open
		// elements and abort these steps.
		if (
			$this->currentNode->isHtmlNamed( $tag ) &&
			!$afe->isInList( $this->currentNode )
		) {
			$this->pop();
			return true; // no more handling required
		}

		// Outer loop: If outer loop counter is greater than or
		// equal to eight, then abort these steps.
		for ( $outer = 0; $outer < 8; $outer++ ) {
			// Let the formatting element be the last element in the list
			// of active formatting elements that: is between the end of
			// the list and the last scope marker in the list, if any, or
			// the start of the list otherwise, and has the same tag name
			// as the token.
			$fmtElt = $afe->findElementByTag( $tag );

			// If there is no such node, then abort these steps and instead
			// act as described in the "any other end tag" entry below.
			if ( !$fmtElt ) {
				return false; // false means handle by the default case
			}

			// Otherwise, if there is such a node, but that node is not in
			// the stack of open elements, then this is a parse error;
			// remove the element from the list, and abort these steps.
			$index = $this->indexOf( $fmtElt );
			if ( $index < 0 ) {
				$afe->remove( $fmtElt );
				return true; // true means no more handling required
			}

			// Otherwise, if there is such a node, and that node is also in
			// the stack of open elements, but the element is not in scope,
			// then this is a parse error; ignore the token, and abort
			// these steps.
			if ( !$this->inScope( $fmtElt ) ) {
				return true;
			}

			// Let the furthest block be the topmost node in the stack of
			// open elements that is lower in the stack than the formatting
			// element, and is an element in the special category. There
			// might not be one.
			// (Position 0 is the root, so "lower in the stack" means a
			// larger index; the first match after $index is the topmost.)
			$furthestBlock = null;
			$furthestBlockIndex = -1;
			$stackLength = $this->length();
			for ( $i = $index + 1; $i < $stackLength; $i++ ) {
				if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) {
					$furthestBlock = $this->node( $i );
					$furthestBlockIndex = $i;
					break;
				}
			}

			// If there is no furthest block, then the UA must skip the
			// subsequent steps and instead just pop all the nodes from the
			// bottom of the stack of open elements, from the current node
			// up to and including the formatting element, and remove the
			// formatting element from the list of active formatting
			// elements.
			if ( !$furthestBlock ) {
				$this->popTag( $fmtElt );
				$afe->remove( $fmtElt );
				return true;
			}

			// Let the common ancestor be the element immediately above
			// the formatting element in the stack of open elements.
			$ancestor = $this->node( $index - 1 );

			// Let a bookmark note the position of the formatting
			// element in the list of active formatting elements
			// relative to the elements on either side of it in the
			// list.
			// (The bookmark is a placeholder BalanceElement that lives only
			// in the AFE list until it is replaced further down.)
			$BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] );
			$afe->insertAfter( $fmtElt, $BOOKMARK );

			// Let node and last node be the furthest block.
			$node = $furthestBlock;
			$lastNode = $furthestBlock;
			$nodeIndex = $furthestBlockIndex;
			$isAFE = false;

			// Inner loop
			for ( $inner = 1; true; $inner++ ) {
				// Let node be the element immediately above node in
				// the stack of open elements, or if node is no longer
				// in the stack of open elements (e.g. because it got
				// removed by this algorithm), the element that was
				// immediately above node in the stack of open elements
				// before node was removed.
				// (Decrementing $nodeIndex covers both cases: a removal
				// only shifts entries at larger indexes.)
				$node = $this->node( --$nodeIndex );

				// If node is the formatting element, then go
				// to the next step in the overall algorithm.
				if ( $node === $fmtElt ) break;

				// If the inner loop counter is greater than three and node
				// is in the list of active formatting elements, then remove
				// node from the list of active formatting elements.
				$isAFE = $afe->isInList( $node );
				if ( $inner > 3 && $isAFE ) {
					$afe->remove( $node );
					$isAFE = false;
				}

				// If node is not in the list of active formatting
				// elements, then remove node from the stack of open
				// elements and then go back to the step labeled inner
				// loop.
				if ( !$isAFE ) {
					// Don't flatten here, since we're about to relocate
					// parts of this $node.
					$this->removeElement( $node, false );
					continue;
				}

				// Create an element for the token for which the
				// element node was created with common ancestor as
				// the intended parent, replace the entry for node
				// in the list of active formatting elements with an
				// entry for the new element, replace the entry for
				// node in the stack of open elements with an entry for
				// the new element, and let node be the new element.
				$newElt = new BalanceElement(
					$node->namespaceURI, $node->localName, $node->attribs );
				$afe->replace( $node, $newElt );
				$this->replaceAt( $nodeIndex, $newElt );
				$node = $newElt;

				// If last node is the furthest block, then move the
				// aforementioned bookmark to be immediately after the
				// new node in the list of active formatting elements.
				if ( $lastNode === $furthestBlock ) {
					$afe->remove( $BOOKMARK );
					$afe->insertAfter( $newElt, $BOOKMARK );
				}

				// Insert last node into node, first removing it from
				// its previous parent node if any.
				$node->appendChild( $lastNode );

				// Let last node be node.
				$lastNode = $node;
			}

			// If the common ancestor node is a table, tbody, tfoot,
			// thead, or tr element, then, foster parent whatever last
			// node ended up being in the previous step, first removing
			// it from its previous parent node if any.
			if (
				$this->fosterParentMode &&
				$ancestor->isA( BalanceSets::$tableSectionRowSet )
			) {
				$this->fosterParent( $lastNode );
			} else {
				// Otherwise, append whatever last node ended up being in
				// the previous step to the common ancestor node, first
				// removing it from its previous parent node if any.
				$ancestor->appendChild( $lastNode );
			}

			// Create an element for the token for which the
			// formatting element was created, with furthest block
			// as the intended parent.
			$newElt2 = new BalanceElement(
				$fmtElt->namespaceURI, $fmtElt->localName, $fmtElt->attribs );

			// Take all of the child nodes of the furthest block and
			// append them to the element created in the last step.
			$newElt2->adoptChildren( $furthestBlock );

			// Append that new element to the furthest block.
			$furthestBlock->appendChild( $newElt2 );

			// Remove the formatting element from the list of active
			// formatting elements, and insert the new element into the
			// list of active formatting elements at the position of
			// the aforementioned bookmark.
			$afe->remove( $fmtElt );
			$afe->replace( $BOOKMARK, $newElt2 );

			// Remove the formatting element from the stack of open
			// elements, and insert the new element into the stack of
			// open elements immediately below the position of the
			// furthest block in that stack.
			$this->removeElement( $fmtElt );
			$this->insertAfter( $furthestBlock, $newElt2 );
		}

		// Reached only after the outer loop has run its full eight rounds.
		return true;
	}

	/**
	 * Return the contents of the open elements stack as a string for
	 * debugging.
+ * @return string + */ + public function __toString() { + $r = []; + foreach ( $this->elements as $elt ) { + array_push( $r, $elt->localName ); + } + return implode( ' ', $r ); + } +} + +/** + * A pseudo-element used as a marker in the list of active formatting elements + * + * @ingroup Parser + * @since 1.27 + */ +class BalanceMarker { + public $nextAFE; + public $prevAFE; +} + +/** + * The list of active formatting elements, which is used to handle + * mis-nested formatting element tags in the HTML5 tree builder + * specification. + * + * @ingroup Parser + * @since 1.27 + * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements + */ +class BalanceActiveFormattingElements { + /** The last (most recent) element in the list */ + private $tail; + + /** The first (least recent) element in the list */ + private $head; + + /** + * An array of arrays representing the population of elements in each bucket + * according to the Noah's Ark clause. The outer array is stack-like, with each + * integer-indexed element representing a segment of the list, bounded by + * markers. The first element represents the segment of the list before the + * first marker. + * + * The inner arrays are indexed by "Noah key", which is a string which uniquely + * identifies each bucket according to the rules in the spec. The value in + * the inner array is the first (least recently inserted) element in the bucket, + * and subsequent members of the bucket can be found by iterating through the + * singly-linked list via $node->nextNoah. + * + * This is optimised for the most common case of inserting into a bucket + * with zero members, and deleting a bucket containing one member. In the + * worst case, iteration through the list is still O(1) in the document + * size, since each bucket can have at most 3 members. 
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Unlink every entry in the list from its neighbours and drop the
	 * head, tail and Noah tables.
	 * NOTE(review): presumably this is done so that the reference cycles
	 * between list entries do not keep them alive -- confirm.
	 */
	public function __destruct() {
		$next = null;
		for ( $node = $this->head; $node; $node = $next ) {
			// Remember the successor before the links are cleared.
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = $node->nextNoah = null;
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a BalanceMarker to the end of the list and start a new
	 * (empty) Noah table for the segment that follows it.
	 */
	public function insertMarker() {
		$elt = new BalanceMarker;
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException If $elt is already in the list.
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if there are already three copies of
		// this element before we encounter a marker, then drop the
		// earliest one ($head, the least recently inserted member of
		// the bucket).
		$noahKey = $elt->getNoahKey();
		// Only the table for the segment after the last marker is used.
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$count = 1;
			$head = $tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
				$count++;
			}
			if ( $count >= 3 ) {
				$this->remove( $head );
			}
			$tail->nextNoah = $elt;
		}
		// Add to the main AFE list
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing <a> "in body mode" and by the adoption agency
	 * algorithm to locate the formatting element.
	 * @param string $tag Local name to look for.
	 * @return BalanceElement|null The matching element, or null if none is
	 *   found before a marker or the start of the list is reached.
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * An element is in the list if it is the head or has a predecessor.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing <a> in body mode.
	 *
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException If $elt is not in the list.
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to the end of its Noah bucket in the table for the
	 * segment after the last marker, creating the bucket if needed.
	 * Unlike push(), no limit on the bucket size is applied here.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from its Noah bucket in the table for the segment after
	 * the last marker.
	 * NOTE(review): only the last table on the stack is consulted and the
	 * bucket is read without an isset() check -- confirm that callers never
	 * pass an element recorded in an earlier segment.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			// $elt is the first member: promote its successor, or drop the
			// bucket entirely when it was the only member.
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
		} else {
			do {
				$prevNoahElt = $noahElt;
				$noahElt = $prevNoahElt->nextNoah;
				if ( $noahElt === $elt ) {
					// Found it, unlink
					$prevNoahElt->nextNoah = $elt->nextNoah;
					$elt->nextNoah = null;
					break;
				}
			} while ( $noahElt );
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundIt = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundIt = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundIt ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			// The freshly inserted element takes over the old entry's
			// position in the list, so its nextAFE is the next entry to
			// recreate.
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging.
	 * Inconsistent back links or a wrong tail pointer are flagged in
	 * the output.
	 * @return string One line per list entry.
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			// The hash suffix tells apart entries with the same local name.
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		// After the walk, $prev is the last entry visited.
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}

/**
 * An implementation of the tree building portion of the HTML5 parsing
 * spec.
 *
 * This is used to balance and tidy output so that the result can
 * always be cleanly serialized/deserialized by an HTML5 parser. It
 * does *not* guarantee "conforming" output -- the HTML5 spec contains
 * a number of constraints which are not enforced by the HTML5 parsing
 * process.
 * But the result will be free of gross errors: misnested or
 * unclosed tags, for example, and will be unchanged by spec-compliant
 * parsing followed by serialization.
 *
 * The tree building stage is structured as a state machine.
 * When comparing the implementation to
 * https://www.w3.org/TR/html5/syntax.html#tree-construction
 * note that each state is implemented as a function with a
 * name ending in `Mode` (because the HTML spec refers to them
 * as insertion modes). The current insertion mode is held by
 * the $parseMode property.
 *
 * The following simplifications have been made:
 * - We handle body content only (ie, we start `in body`.)
 * - The document is never in "quirks mode".
 * - All occurrences of < and > have been entity escaped, so we
 *   can parse tags by simply splitting on those two characters.
 *   (This also simplifies the handling of < inside <textarea>.)
 *   The character < must not appear inside comments.
 *   Similarly, all attributes have been "cleaned" and are double-quoted
 *   and escaped.
 * - All null characters are assumed to have been removed.
 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
 *   <frame>, <plaintext>, <xmp>, <iframe>,
 *   <noembed>, <noscript>, <script>, <title>. As a result,
 *   further simplifications can be made:
 *   - `frameset-ok` is not tracked.
 *   - `head element pointer` is not tracked (but presumed non-null)
 *   - Tokenizer has only a single mode. (<textarea> wants RCDATA and
 *     <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
 *
 * We generally mark places where we omit cases from the spec due to
 * disallowed elements with a comment: `// OMITTED: <element-name>`.
 *
 * The HTML spec keeps a flag during the parsing process to track
 * whether or not a "parse error" has been encountered. We don't
 * bother to track that flag, we just implement the error-handling
 * process as specified.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
 */
class Balancer {
	/**
	 * Name of the method implementing the current insertion mode (for
	 * example 'inBodyMode'); insertToken() dispatches to it.
	 */
	private $parseMode;
	/** @var \Iterator Iterator over the input text split on '<'. */
	private $bitsIterator;
	/** Copy of the 'allowedHtmlElements' config entry (array or null). */
	private $allowedHtmlElements;
	/** @var BalanceActiveFormattingElements */
	private $afe;
	/** @var BalanceStack */
	private $stack;
	/** Copy of the 'strict' config entry. */
	private $strict;
	/** Copy of the 'allowComments' config entry. */
	private $allowComments;
	/** Full configuration array, with defaults applied. */
	private $config;

	private $textIntegrationMode;
	private $pendingTableText;
	private $originalInsertionMode;
	/** Context element for fragment parsing; balance() sets a <body> element. */
	private $fragmentContext;
	private $formElementPointer;
	/** When true, insertToken() drops a leading "\n" from the next text token. */
	private $ignoreLinefeed;
	/** False, or the tag name whose end tag leaves the emulated RCDATA mode. */
	private $inRCDATA;
	/** False, or the tag name whose end tag leaves the emulated RAWTEXT mode. */
	private $inRAWTEXT;

	/** @var callable|null */
	private $processingCallback;
	/** @var array */
	private $processingArgs;

	/**
	 * Valid HTML5 comments.
	 * Regex borrowed from Tim Starling's "remex-html" project.
	 */
	const VALID_COMMENT_REGEX = "~ !--
		(                             # 1. Comment match detector
			> | -> | # Invalid short close
			(                         # 2. Comment contents
				(?:
					(?! --> )
					(?! --!> )
					(?! --! \z )
					(?! -- \z )
					(?! - \z )
					.
				)*+
			)
			(                         # 3. Comment close
				--> |   # Normal close
				--!> |  # Comment end bang
				(                     # 4. Indicate matches requiring EOF
					--! |   # EOF in comment end bang state
					-- |    # EOF in comment end state
					-  |    # EOF in comment end dash state
					(?#nothing)  # EOF in comment state
				)
			)
		)
		([^<]*) \z                    # 5. Non-tag text after the comment
		~xs";

	/**
	 * Create a new Balancer.
	 * @param array $config Balancer configuration. Includes:
	 *     'strict' : boolean, defaults to false.
	 *         When true, enforces syntactic constraints on input:
	 *         all non-tag '<' must be escaped, all attributes must be
	 *         separated by a single space and double-quoted. This is
	 *         consistent with the output of the Sanitizer.
	 *     'allowedHtmlElements' : array, defaults to null.
	 *         When present, the keys of this associative array give
	 *         the acceptable HTML tag names. When not present, no
	 *         tag sanitization is done.
+ * 'tidyCompat' : boolean, defaults to false. + * When true, the serialization algorithm is tweaked to + * provide historical compatibility with the old "tidy" + * program: <p>-wrapping is done to the children of + * <body> and <blockquote> elements, and empty elements + * are removed. The <pre>/<listing>/<textarea> serialization + * is also tweaked to allow lossless round trips. + * (See: https://github.com/whatwg/html/issues/944) + * 'allowComments': boolean, defaults to true. + * When true, allows HTML comments in the input. + * The Sanitizer generally strips all comments, so if you + * are running on sanitized output you can set this to + * false to get a bit more performance. + */ + public function __construct( array $config = [] ) { + $this->config = $config = $config + [ + 'strict' => false, + 'allowedHtmlElements' => null, + 'tidyCompat' => false, + 'allowComments' => true, + ]; + $this->allowedHtmlElements = $config['allowedHtmlElements']; + $this->strict = $config['strict']; + $this->allowComments = $config['allowComments']; + if ( $this->allowedHtmlElements !== null ) { + // Sanity check! + $bad = array_uintersect_assoc( + $this->allowedHtmlElements, + BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE], + function ( $a, $b ) { + // Ignore the values (just intersect the keys) by saying + // all values are equal to each other. + return 0; + } + ); + if ( count( $bad ) > 0 ) { + $badstr = implode( ',', array_keys( $bad ) ); + throw new ParameterAssertionException( + '$config', + 'Balance attempted with sanitization including ' . + "unsupported elements: {$badstr}" + ); + } + } + } + + /** + * Return a balanced HTML string for the HTML fragment given by $text, + * subject to the caveats listed in the class description. The result + * will typically be idempotent -- that is, rebalancing the output + * would result in no change. 
+ * + * @param string $text The markup to be balanced + * @param callable $processingCallback Callback to do any variable or + * parameter replacements in HTML attributes values + * @param array|bool $processingArgs Arguments for the processing callback + * @return string The balanced markup + */ + public function balance( $text, $processingCallback = null, $processingArgs = [] ) { + $this->parseMode = 'inBodyMode'; + $this->bitsIterator = new ExplodeIterator( '<', $text ); + $this->afe = new BalanceActiveFormattingElements(); + $this->stack = new BalanceStack( $this->config ); + $this->processingCallback = $processingCallback; + $this->processingArgs = $processingArgs; + + $this->textIntegrationMode = + $this->ignoreLinefeed = + $this->inRCDATA = + $this->inRAWTEXT = false; + + // The stack is constructed with an <html> element already on it. + // Set this up as a fragment parsed with <body> as the context. + $this->fragmentContext = + new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] ); + $this->resetInsertionMode(); + $this->formElementPointer = null; + for ( $e = $this->fragmentContext; $e != null; $e = $e->parent ) { + if ( $e->isHtmlNamed( 'form' ) ) { + $this->formElementPointer = $e; + break; + } + } + + // First element is text not tag + $x = $this->bitsIterator->current(); + $this->bitsIterator->next(); + $this->insertToken( 'text', str_replace( '>', '>', $x ) ); + // Now process each tag. + while ( $this->bitsIterator->valid() ) { + $this->advance(); + } + $this->insertToken( 'eof', null ); + $result = $this->stack->getOutput(); + // Free memory before returning. + $this->bitsIterator = null; + $this->afe = null; + $this->stack = null; + $this->fragmentContext = null; + $this->formElementPointer = null; + return $result; + } + + /** + * Pass a token to the tree builder. The $token will be one of the + * strings "tag", "endtag", or "text". 
	 *
	 * @param string $token Token type.
	 * @param string|null $value Tag name for tags, text for text tokens;
	 *   null is passed for the end-of-file token.
	 * @param array|null $attribs Attributes of a tag token, if any.
	 * @param bool $selfClose Passed through unchanged to the handler.
	 * @return bool|null False when an unsupported tag is dropped, true when
	 *   an empty text token is ignored, otherwise whatever the dispatched
	 *   handler returns.
	 */
	private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
		// validate tags against $unsupportedSet
		if ( $token === 'tag' || $token === 'endtag' ) {
			if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
				// As described in "simplifications" above, these tags are
				// not supported in the balancer.
				Assert::invariant(
					!$this->strict,
					"Unsupported $token <$value> found."
				);
				return false;
			}
		} elseif ( $token === 'text' && $value === '' ) {
			// Don't actually inject the empty string as a text token.
			return true;
		}
		// Support pre/listing/textarea by suppressing initial linefeed
		if ( $this->ignoreLinefeed ) {
			// The flag applies to the very next token only.
			$this->ignoreLinefeed = false;
			if ( $token === 'text' ) {
				// $value is non-empty here: '' was handled above.
				if ( $value[0] === "\n" ) {
					if ( $value === "\n" ) {
						// Nothing would be left, don't inject the empty string.
						return true;
					}
					$value = substr( $value, 1 );
				}
			}
		}
		// Some hoops we have to jump through
		$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );

		// The spec calls this the "tree construction dispatcher".
		// Assume foreign content, then look for a reason to treat the
		// token as HTML instead.
		$isForeign = true;
		if (
			$this->stack->length() === 0 ||
			$adjusted->isHtml() ||
			$token === 'eof'
		) {
			$isForeign = false;
		} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
			if ( $token === 'text' ) {
				$isForeign = false;
			} elseif (
				$token === 'tag' &&
				$value !== 'mglyph' && $value !== 'malignmark'
			) {
				$isForeign = false;
			}
		} elseif (
			$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$adjusted->localName === 'annotation-xml' &&
			$token === 'tag' && $value === 'svg'
		) {
			$isForeign = false;
		} elseif (
			$adjusted->isHtmlIntegrationPoint() &&
			( $token === 'tag' || $token === 'text' )
		) {
			$isForeign = false;
		}
		if ( $isForeign ) {
			return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
		} else {
			// Dispatch to the method named by the current insertion mode.
			$func = $this->parseMode;
			return $this->$func( $token, $value, $attribs, $selfClose );
		}
	}

	/**
	 * Handle a token while the adjusted current node is in foreign
	 * (non-HTML) content. Takes the same arguments as insertToken().
	 *
	 * @param string $token Token type.
	 * @param string|null $value Tag name or text.
	 * @param array|null $attribs Attributes of a tag token, if any.
	 * @param bool $selfClose When true, a newly inserted foreign element is
	 *   popped again immediately.
	 * @return bool|null
	 */
	private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
		if ( $token === 'text' ) {
			$this->stack->insertText( $value );
			return true;
		} elseif ( $token === 'comment' ) {
			$this->stack->insertComment( $value );
			return true;
		} elseif ( $token === 'tag' ) {
			switch ( $value ) {
			case 'font':
				// <font> is treated like the tags listed below only when it
				// has none of these attributes.
				if ( isset( $attribs['color'] )
					|| isset( $attribs['face'] )
					|| isset( $attribs['size'] )
				) {
					break;
				}
				// otherwise, fall through
			case 'b':
			case 'big':
			case 'blockquote':
			case 'body':
			case 'br':
			case 'center':
			case 'code':
			case 'dd':
			case 'div':
			case 'dl':
			case 'dt':
			case 'em':
			case 'embed':
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
			case 'head':
			case 'hr':
			case 'i':
			case 'img':
			case 'li':
			case 'listing':
			case 'menu':
			case 'meta':
			case 'nobr':
			case 'ol':
			case 'p':
			case 'pre':
			case 'ruby':
			case 's':
			case 'small':
			case 'span':
			case 'strong':
			case 'strike':
			case 'sub':
			case 'sup':
			case 'table':
			case 'tt':
			case 'u':
			case 'ul':
			case 'var':
+ if ( $this->fragmentContext ) { + break; + } + while ( true ) { + $this->stack->pop(); + $node = $this->stack->currentNode; + if ( + $node->isMathmlTextIntegrationPoint() || + $node->isHtmlIntegrationPoint() || + $node->isHtml() + ) { + break; + } + } + return $this->insertToken( $token, $value, $attribs, $selfClose ); + } + // "Any other start tag" + $adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ? + $this->fragmentContext : $this->stack->currentNode; + $this->stack->insertForeignElement( + $adjusted->namespaceURI, $value, $attribs + ); + if ( $selfClose ) { + $this->stack->pop(); + } + return true; + } elseif ( $token === 'endtag' ) { + $first = true; + foreach ( $this->stack as $i => $node ) { + if ( $node->isHtml() && !$first ) { + // process the end tag as HTML + $func = $this->parseMode; + return $this->$func( $token, $value, $attribs, $selfClose ); + } elseif ( $i === 0 ) { + return true; + } elseif ( $node->localName === $value ) { + $this->stack->popTag( $node ); + return true; + } + $first = false; + } + } + } + + /** + * Grab the next "token" from $bitsIterator. This is either a open/close + * tag or text or a comment, depending on whether the Sanitizer approves. + */ + private function advance() { + $x = $this->bitsIterator->current(); + $this->bitsIterator->next(); + $regs = []; + // Handle comments. These won't be generated by mediawiki (they + // are stripped in the Sanitizer) but may be generated by extensions. + if ( + $this->allowComments && + !( $this->inRCDATA || $this->inRAWTEXT ) && + preg_match( self::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) && + // verify EOF condition where necessary + ( $regs[4][1] < 0 || !$this->bitsIterator->valid() ) + ) { + $contents = $regs[2][0]; + $rest = $regs[5][0]; + $this->insertToken( 'comment', $contents ); + $this->insertToken( 'text', str_replace( '>', '>', $rest ) ); + return; + } + // $slash: Does the current element start with a '/'? 
+ // $t: Current element name + // $attribStr: String between element name and > + // $brace: Ending '>' or '/>' + // $rest: Everything until the next element from the $bitsIterator + if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) { + list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs; + $t = strtolower( $t ); + if ( $this->strict ) { + // Verify that attributes are all properly double-quoted + Assert::invariant( + preg_match( + '/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr + ), + "Bad attribute string found" + ); + } + } else { + Assert::invariant( + !$this->strict, "< found which does not start a valid tag" + ); + $slash = $t = $attribStr = $brace = $rest = null; + } + $goodTag = $t; + if ( $this->inRCDATA ) { + if ( $slash && $t === $this->inRCDATA ) { + $this->inRCDATA = false; + } else { + // No tags allowed; this emulates the "rcdata" tokenizer mode. + $goodTag = false; + } + } + if ( $this->inRAWTEXT ) { + if ( $slash && $t === $this->inRAWTEXT ) { + $this->inRAWTEXT = false; + } else { + // No tags allowed, no entity-escaping done. + $goodTag = false; + } + } + $sanitize = $this->allowedHtmlElements !== null; + if ( $sanitize ) { + $goodTag = $t && isset( $this->allowedHtmlElements[$t] ); + } + if ( $goodTag ) { + if ( is_callable( $this->processingCallback ) ) { + call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] ); + } + if ( $sanitize ) { + $goodTag = Sanitizer::validateTag( $attribStr, $t ); + } + } + if ( $goodTag ) { + if ( $sanitize ) { + $attribs = Sanitizer::decodeTagAttributes( $attribStr ); + $attribs = Sanitizer::validateTagAttributes( $attribs, $t ); + } else { + $attribs = Sanitizer::decodeTagAttributes( $attribStr ); + } + $goodTag = $this->insertToken( + $slash ? 
'endtag' : 'tag', $t, $attribs, $brace === '/>' + ); + } + if ( $goodTag ) { + $rest = str_replace( '>', '>', $rest ); + $this->insertToken( 'text', str_replace( '>', '>', $rest ) ); + } elseif ( $this->inRAWTEXT ) { + $this->insertToken( 'text', "<$x" ); + } else { + // bad tag; serialize entire thing as text. + $this->insertToken( 'text', '<' . str_replace( '>', '>', $x ) ); + } + } + + private function switchMode( $mode ) { + Assert::parameter( + substr( $mode, -4 ) === 'Mode', '$mode', 'should end in Mode' + ); + $oldMode = $this->parseMode; + $this->parseMode = $mode; + return $oldMode; + } + + private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) { + $this->switchMode( $mode ); + return $this->insertToken( $token, $value, $attribs, $selfClose ); + } + + private function resetInsertionMode() { + $last = false; + foreach ( $this->stack as $i => $node ) { + if ( $i === 0 ) { + $last = true; + if ( $this->fragmentContext ) { + $node = $this->fragmentContext; + } + } + if ( $node->isHtml() ) { + switch ( $node->localName ) { + case 'select': + $stackLength = $this->stack->length(); + for ( $j = $i + 1; $j < $stackLength - 1; $j++ ) { + $ancestor = $this->stack->node( $stackLength - $j - 1 ); + if ( $ancestor->isHtmlNamed( 'template' ) ) { + break; + } + if ( $ancestor->isHtmlNamed( 'table' ) ) { + $this->switchMode( 'inSelectInTableMode' ); + return; + } + } + $this->switchMode( 'inSelectMode' ); + return; + case 'tr': + $this->switchMode( 'inRowMode' ); + return; + case 'tbody': + case 'tfoot': + case 'thead': + $this->switchMode( 'inTableBodyMode' ); + return; + case 'caption': + $this->switchMode( 'inCaptionMode' ); + return; + case 'colgroup': + $this->switchMode( 'inColumnGroupMode' ); + return; + case 'table': + $this->switchMode( 'inTableMode' ); + return; + case 'template': + $this->switchMode( + array_slice( $this->templateInsertionModes, -1 )[0] + ); + return; + case 'body': + $this->switchMode( 'inBodyMode' ); + return; 
+ // OMITTED: <frameset> + // OMITTED: <html> + // OMITTED: <head> + default: + if ( !$last ) { + // OMITTED: <head> + if ( $node->isA( BalanceSets::$tableCellSet ) ) { + $this->switchMode( 'inCellMode' ); + return; + } + } + } + } + if ( $last ) { + $this->switchMode( 'inBodyMode' ); + return; + } + } + } + + private function stopParsing() { + // Most of the spec methods are inapplicable, other than step 2: + // "pop all the nodes off the stack of open elements". + // We're going to keep the top-most <html> element on the stack, though. + + // Clear the AFE list first, otherwise the element objects will stay live + // during serialization, potentially using O(N^2) memory. Note that + // popping the stack will never result in reconstructing the active + // formatting elements. + $this->afe = null; + $this->stack->popTo( 1 ); + } + + private function parseRawText( $value, $attribs = null ) { + $this->stack->insertHTMLElement( $value, $attribs ); + $this->inRAWTEXT = $value; + $this->originalInsertionMode = $this->switchMode( 'inTextMode' ); + return true; + } + + private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) { + if ( $token === 'text' ) { + $this->stack->insertText( $value ); + return true; + } elseif ( $token === 'eof' ) { + $this->stack->pop(); + return $this->switchModeAndReprocess( + $this->originalInsertionMode, $token, $value, $attribs, $selfClose + ); + } elseif ( $token === 'endtag' ) { + $this->stack->pop(); + $this->switchMode( $this->originalInsertionMode ); + return true; + } + return true; + } + + private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) { + if ( $token === 'text' ) { + if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) { + $this->stack->insertText( $matches[0] ); + $value = substr( $value, strlen( $matches[0] ) ); + } + if ( strlen( $value ) === 0 ) { + return true; // All text handled. + } + // Fall through to handle non-whitespace below. 
+ } elseif ( $token === 'tag' ) { + switch ( $value ) { + case 'meta': + // OMITTED: in a full HTML parser, this might change the encoding. + // falls through + // OMITTED: <html> + case 'base': + case 'basefont': + case 'bgsound': + case 'link': + $this->stack->insertHTMLElement( $value, $attribs ); + $this->stack->pop(); + return true; + // OMITTED: <title> + // OMITTED: <noscript> + case 'noframes': + case 'style': + return $this->parseRawText( $value, $attribs ); + // OMITTED: <script> + case 'template': + $this->stack->insertHTMLElement( $value, $attribs ); + $this->afe->insertMarker(); + // OMITTED: frameset_ok + $this->switchMode( 'inTemplateMode' ); + $this->templateInsertionModes[] = $this->parseMode; + return true; + // OMITTED: <head> + } + } elseif ( $token === 'endtag' ) { + switch ( $value ) { + // OMITTED: <head> + // OMITTED: <body> + // OMITTED: <html> + case 'br': + break; // handle at the bottom of the function + case 'template': + if ( $this->stack->indexOf( $value ) < 0 ) { + return true; // Ignore the token. 
+ } + $this->stack->generateImpliedEndTags( null, true /* thorough */ ); + $this->stack->popTag( $value ); + $this->afe->clearToMarker(); + array_pop( $this->templateInsertionModes ); + $this->resetInsertionMode(); + return true; + default: + // ignore any other end tag + return true; + } + } elseif ( $token === 'comment' ) { + $this->stack->insertComment( $value ); + return true; + } + + // If not handled above + $this->inHeadMode( 'endtag', 'head' ); // synthetic </head> + // Then redo this one + return $this->insertToken( $token, $value, $attribs, $selfClose ); + } + + private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) { + if ( $token === 'text' ) { + $this->afe->reconstruct( $this->stack ); + $this->stack->insertText( $value ); + return true; + } elseif ( $token === 'eof' ) { + if ( !empty( $this->templateInsertionModes ) ) { + return $this->inTemplateMode( $token, $value, $attribs, $selfClose ); + } + $this->stopParsing(); + return true; + } elseif ( $token === 'tag' ) { + switch ( $value ) { + // OMITTED: <html> + case 'base': + case 'basefont': + case 'bgsound': + case 'link': + case 'meta': + case 'noframes': + // OMITTED: <script> + case 'style': + case 'template': + // OMITTED: <title> + return $this->inHeadMode( $token, $value, $attribs, $selfClose ); + // OMITTED: <body> + // OMITTED: <frameset> + + case 'address': + case 'article': + case 'aside': + case 'blockquote': + case 'center': + case 'details': + case 'dialog': + case 'dir': + case 'div': + case 'dl': + case 'fieldset': + case 'figcaption': + case 'figure': + case 'footer': + case 'header': + case 'hgroup': + case 'main': + case 'nav': + case 'ol': + case 'p': + case 'section': + case 'summary': + case 'ul': + if ( $this->stack->inButtonScope( 'p' ) ) { + $this->inBodyMode( 'endtag', 'p' ); + } + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + + case 'menu': + if ( $this->stack->inButtonScope( "p" ) ) { + $this->inBodyMode( 'endtag', 'p' ); 
+ } + if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) { + $this->stack->pop(); + } + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + + case 'h1': + case 'h2': + case 'h3': + case 'h4': + case 'h5': + case 'h6': + if ( $this->stack->inButtonScope( 'p' ) ) { + $this->inBodyMode( 'endtag', 'p' ); + } + if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) { + $this->stack->pop(); + } + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + + case 'pre': + case 'listing': + if ( $this->stack->inButtonScope( 'p' ) ) { + $this->inBodyMode( 'endtag', 'p' ); + } + $this->stack->insertHTMLElement( $value, $attribs ); + $this->ignoreLinefeed = true; + // OMITTED: frameset_ok + return true; + + case 'form': + if ( + $this->formElementPointer && + $this->stack->indexOf( 'template' ) < 0 + ) { + return true; // in a form, not in a template. + } + if ( $this->stack->inButtonScope( "p" ) ) { + $this->inBodyMode( 'endtag', 'p' ); + } + $elt = $this->stack->insertHTMLElement( $value, $attribs ); + if ( $this->stack->indexOf( 'template' ) < 0 ) { + $this->formElementPointer = $elt; + } + return true; + + case 'li': + // OMITTED: frameset_ok + foreach ( $this->stack as $node ) { + if ( $node->isHtmlNamed( 'li' ) ) { + $this->inBodyMode( 'endtag', 'li' ); + break; + } + if ( + $node->isA( BalanceSets::$specialSet ) && + !$node->isA( BalanceSets::$addressDivPSet ) + ) { + break; + } + } + if ( $this->stack->inButtonScope( 'p' ) ) { + $this->inBodyMode( 'endtag', 'p' ); + } + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + + case 'dd': + case 'dt': + // OMITTED: frameset_ok + foreach ( $this->stack as $node ) { + if ( $node->isHtmlNamed( 'dd' ) ) { + $this->inBodyMode( 'endtag', 'dd' ); + break; + } + if ( $node->isHtmlNamed( 'dt' ) ) { + $this->inBodyMode( 'endtag', 'dt' ); + break; + } + if ( + $node->isA( BalanceSets::$specialSet ) && + !$node->isA( BalanceSets::$addressDivPSet ) + ) { + break; + } + } 
+ if ( $this->stack->inButtonScope( 'p' ) ) { + $this->inBodyMode( 'endtag', 'p' ); + } + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + + // OMITTED: <plaintext> + + case 'button': + if ( $this->stack->inScope( 'button' ) ) { + $this->inBodyMode( 'endtag', 'button' ); + return $this->insertToken( $token, $value, $attribs, $selfClose ); + } + $this->afe->reconstruct( $this->stack ); + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + + case 'a': + $activeElement = $this->afe->findElementByTag( 'a' ); + if ( $activeElement ) { + $this->inBodyMode( 'endtag', 'a' ); + if ( $this->afe->isInList( $activeElement ) ) { + $this->afe->remove( $activeElement ); + // Don't flatten here, since when we fall + // through below we might foster parent + // the new <a> tag inside this one. + $this->stack->removeElement( $activeElement, false ); + } + } + // Falls through + case 'b': + case 'big': + case 'code': + case 'em': + case 'font': + case 'i': + case 's': + case 'small': + case 'strike': + case 'strong': + case 'tt': + case 'u': + $this->afe->reconstruct( $this->stack ); + $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) ); + return true; + + case 'nobr': + $this->afe->reconstruct( $this->stack ); + if ( $this->stack->inScope( 'nobr' ) ) { + $this->inBodyMode( 'endtag', 'nobr' ); + $this->afe->reconstruct( $this->stack ); + } + $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) ); + return true; + + case 'applet': + case 'marquee': + case 'object': + $this->afe->reconstruct( $this->stack ); + $this->stack->insertHTMLElement( $value, $attribs ); + $this->afe->insertMarker(); + // OMITTED: frameset_ok + return true; + + case 'table': + // The document is never in "quirks mode"; see simplifications + // above. 
+ if ( $this->stack->inButtonScope( 'p' ) ) { + $this->inBodyMode( 'endtag', 'p' ); + } + $this->stack->insertHTMLElement( $value, $attribs ); + // OMITTED: frameset_ok + $this->switchMode( 'inTableMode' ); + return true; + + case 'area': + case 'br': + case 'embed': + case 'img': + case 'keygen': + case 'wbr': + $this->afe->reconstruct( $this->stack ); + $this->stack->insertHTMLElement( $value, $attribs ); + $this->stack->pop(); + // OMITTED: frameset_ok + return true; + + case 'input': + $this->afe->reconstruct( $this->stack ); + $this->stack->insertHTMLElement( $value, $attribs ); + $this->stack->pop(); + // OMITTED: frameset_ok + // (hence we don't need to examine the tag's "type" attribute) + return true; + + case 'param': + case 'source': + case 'track': + $this->stack->insertHTMLElement( $value, $attribs ); + $this->stack->pop(); + return true; + + case 'hr': + if ( $this->stack->inButtonScope( 'p' ) ) { + $this->inBodyMode( 'endtag', 'p' ); + } + if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) { + $this->stack->pop(); + } + $this->stack->insertHTMLElement( $value, $attribs ); + $this->stack->pop(); + return true; + + case 'image': + // warts! 
+ return $this->inBodyMode( $token, 'img', $attribs, $selfClose ); + + case 'textarea': + $this->stack->insertHTMLElement( $value, $attribs ); + $this->ignoreLinefeed = true; + $this->inRCDATA = $value; // emulate rcdata tokenizer mode + // OMITTED: frameset_ok + return true; + + // OMITTED: <xmp> + // OMITTED: <iframe> + // OMITTED: <noembed> + // OMITTED: <noscript> + + case 'select': + $this->afe->reconstruct( $this->stack ); + $this->stack->insertHTMLElement( $value, $attribs ); + switch ( $this->parseMode ) { + case 'inTableMode': + case 'inCaptionMode': + case 'inTableBodyMode': + case 'inRowMode': + case 'inCellMode': + $this->switchMode( 'inSelectInTableMode' ); + return true; + default: + $this->switchMode( 'inSelectMode' ); + return true; + } + + case 'optgroup': + case 'option': + if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) { + $this->inBodyMode( 'endtag', 'option' ); + } + $this->afe->reconstruct( $this->stack ); + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + + case 'menuitem': + if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) { + $this->stack->pop(); + } + $this->afe->reconstruct( $this->stack ); + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + + case 'rb': + case 'rtc': + if ( $this->stack->inScope( 'ruby' ) ) { + $this->stack->generateImpliedEndTags(); + } + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + + case 'rp': + case 'rt': + if ( $this->stack->inScope( 'ruby' ) ) { + $this->stack->generateImpliedEndTags( 'rtc' ); + } + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + + case 'math': + $this->afe->reconstruct( $this->stack ); + // We skip the spec's "adjust MathML attributes" and + // "adjust foreign attributes" steps, since the browser will + // do this later when it parses the output and it doesn't affect + // balancing. 
+ $this->stack->insertForeignElement( + BalanceSets::MATHML_NAMESPACE, $value, $attribs + ); + if ( $selfClose ) { + // emit explicit </math> tag. + $this->stack->pop(); + } + return true; + + case 'svg': + $this->afe->reconstruct( $this->stack ); + // We skip the spec's "adjust SVG attributes" and + // "adjust foreign attributes" steps, since the browser will + // do this later when it parses the output and it doesn't affect + // balancing. + $this->stack->insertForeignElement( + BalanceSets::SVG_NAMESPACE, $value, $attribs + ); + if ( $selfClose ) { + // emit explicit </svg> tag. + $this->stack->pop(); + } + return true; + + case 'caption': + case 'col': + case 'colgroup': + // OMITTED: <frame> + case 'head': + case 'tbody': + case 'td': + case 'tfoot': + case 'th': + case 'thead': + case 'tr': + // Ignore table tags if we're not inTableMode + return true; + } + + // Handle any other start tag here + $this->afe->reconstruct( $this->stack ); + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + } elseif ( $token === 'endtag' ) { + switch ( $value ) { + // </body>,</html> are unsupported. 
+ + case 'template': + return $this->inHeadMode( $token, $value, $attribs, $selfClose ); + + case 'address': + case 'article': + case 'aside': + case 'blockquote': + case 'button': + case 'center': + case 'details': + case 'dialog': + case 'dir': + case 'div': + case 'dl': + case 'fieldset': + case 'figcaption': + case 'figure': + case 'footer': + case 'header': + case 'hgroup': + case 'listing': + case 'main': + case 'menu': + case 'nav': + case 'ol': + case 'pre': + case 'section': + case 'summary': + case 'ul': + // Ignore if there is not a matching open tag + if ( !$this->stack->inScope( $value ) ) { + return true; + } + $this->stack->generateImpliedEndTags(); + $this->stack->popTag( $value ); + return true; + + case 'form': + if ( $this->stack->indexOf( 'template' ) < 0 ) { + $openform = $this->formElementPointer; + $this->formElementPointer = null; + if ( !$openform || !$this->stack->inScope( $openform ) ) { + return true; + } + $this->stack->generateImpliedEndTags(); + // Don't flatten yet if we're removing a <form> element + // out-of-order. (eg. 
`<form><div></form>`) + $flatten = ( $this->stack->currentNode === $openform ); + $this->stack->removeElement( $openform, $flatten ); + } else { + if ( !$this->stack->inScope( 'form' ) ) { + return true; + } + $this->stack->generateImpliedEndTags(); + $this->stack->popTag( 'form' ); + } + return true; + + case 'p': + if ( !$this->stack->inButtonScope( 'p' ) ) { + $this->inBodyMode( 'tag', 'p', [] ); + return $this->insertToken( $token, $value, $attribs, $selfClose ); + } + $this->stack->generateImpliedEndTags( $value ); + $this->stack->popTag( $value ); + return true; + + case 'li': + if ( !$this->stack->inListItemScope( $value ) ) { + return true; // ignore + } + $this->stack->generateImpliedEndTags( $value ); + $this->stack->popTag( $value ); + return true; + + case 'dd': + case 'dt': + if ( !$this->stack->inScope( $value ) ) { + return true; // ignore + } + $this->stack->generateImpliedEndTags( $value ); + $this->stack->popTag( $value ); + return true; + + case 'h1': + case 'h2': + case 'h3': + case 'h4': + case 'h5': + case 'h6': + if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) { + return true; // ignore + } + $this->stack->generateImpliedEndTags(); + $this->stack->popTag( BalanceSets::$headingSet ); + return true; + + case 'sarcasm': + // Take a deep breath, then: + break; + + case 'a': + case 'b': + case 'big': + case 'code': + case 'em': + case 'font': + case 'i': + case 'nobr': + case 's': + case 'small': + case 'strike': + case 'strong': + case 'tt': + case 'u': + if ( $this->stack->adoptionAgency( $value, $this->afe ) ) { + return true; // If we did something, we're done. + } + break; // Go to the "any other end tag" case. 
/**
 * Handle a token while the parser is directly inside a <table>.
 *
 * Table-structure start tags open the matching section (implying
 * <colgroup> or <tbody> where needed) and switch mode. Anything that
 * is not table structure is processed by inBodyMode() with foster
 * parenting enabled on the stack.
 *
 * @param string $token Token type: 'text', 'eof', 'tag', 'endtag' or 'comment'
 * @param string $value Text/comment contents, or the lower-cased tag name
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Whether a start tag was written with `/>`
 * @return bool|null
 */
private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfClose );
		} elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Buffer the text; inTableTextMode decides where it goes
			// once the next non-text token arrives.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess( 'inTableTextMode',
				$token, $value, $attribs, $selfClose );
		}
		// fall through to default case.
	} elseif ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
			// Close anything left open inside the table (for example
			// a foster-parented element) before opening the caption,
			// exactly as the colgroup and tbody cases below do;
			// otherwise the caption would be inserted as a child of
			// that leftover element.
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->afe->insertMarker();
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCaptionMode' );
			return true;
		case 'colgroup':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inColumnGroupMode' );
			return true;
		case 'col':
			// Imply a <colgroup>, then reprocess the <col>.
			$this->inTableMode( 'tag', 'colgroup', [] );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'tbody':
		case 'tfoot':
		case 'thead':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inTableBodyMode' );
			return true;
		case 'td':
		case 'th':
		case 'tr':
			// Imply a <tbody>, then reprocess the token.
			$this->inTableMode( 'tag', 'tbody', [] );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore this tag.
			}
			// A nested <table> closes the open one first.
			$this->inTableMode( 'endtag', $value );
			return $this->insertToken( $token, $value, $attribs, $selfClose );

		case 'style':
		// OMITTED: <script>
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );

		case 'input':
			if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
				break; // Handle this as "everything else"
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'form':
			if (
				$this->formElementPointer ||
				$this->stack->indexOf( 'template' ) >= 0
			) {
				return true; // ignore this token
			}
			// The form is recorded and immediately popped again.
			$this->formElementPointer =
				$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->popTag( $this->formElementPointer );
			return true;
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore.
			}
			$this->stack->popTag( $value );
			$this->resetInsertionMode();
			return true;
		// OMITTED: <body>
		case 'caption':
		case 'col':
		case 'colgroup':
		// OMITTED: <html>
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			return true; // Ignore the token.
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	// This is the "anything else" case:
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfClose );
	$this->stack->fosterParentMode = false;
	return true;
}
/**
 * Helper for inCaptionMode: close the open <caption>, if there is one
 * in table scope, and return to inTableMode.
 *
 * @return bool False when no caption is in table scope (nothing is
 *   changed), true otherwise
 */
private function endCaption() {
	if ( $this->stack->inTableScope( 'caption' ) ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}

/**
 * Handle a token while inside a <caption>.
 *
 * Table-structure start tags and </table> close the caption and are
 * then reprocessed; stray structural end tags are dropped; everything
 * else is delegated to inBodyMode().
 *
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfClose
 * @return bool|null
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) {
	// Tag names common to the start-tag and end-tag lists below.
	$structural = [ 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr' ];

	if ( $token === 'tag' ) {
		if ( $value === 'caption' || in_array( $value, $structural, true ) ) {
			// Implicitly close the caption, then reprocess.
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'caption' ) {
			$this->endCaption();
			return true;
		}
		if ( $value === 'table' ) {
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
		// OMITTED: <html>
		if ( $value === 'body' || in_array( $value, $structural, true ) ) {
			// Ignore the token
			return true;
		}
	}

	// The Anything Else case
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}

/**
 * Handle a token while inside a <colgroup>.
 *
 * Leading whitespace, comments and <col> are inserted in place. Any
 * token not handled here closes the colgroup (when it is the current
 * node) and is reprocessed, or is dropped otherwise.
 *
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfClose
 * @return bool|null
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'text' ) {
		// Peel off leading whitespace and keep it inside the colgroup.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
		// The non-whitespace remainder is handled as "anything else".
	} elseif ( $token === 'tag' ) {
		// OMITTED: <html>
		if ( $value === 'col' ) {
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'colgroup' ) {
			if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
				$this->stack->pop();
				$this->switchMode( 'inTableMode' );
			}
			// Otherwise the token is ignored.
			return true;
		}
		if ( $value === 'col' ) {
			return true; // Ignore the token.
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	}

	// Anything else
	if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		$this->inColumnGroupMode( 'endtag', 'colgroup' );
		return $this->insertToken( $token, $value, $attribs, $selfClose );
	}
	return true; // Ignore the token.
}

/**
 * Helper function for inTableBodyMode: close the open table section
 * (tbody, thead or tfoot) and return to inTableMode.
 *
 * @return bool False when no section is in table scope (nothing is
 *   changed), true otherwise
 */
private function endSection() {
	$stack = $this->stack;
	$sectionOpen = $stack->inTableScope( 'tbody' ) ||
		$stack->inTableScope( 'thead' ) ||
		$stack->inTableScope( 'tfoot' );
	if ( !$sectionOpen ) {
		return false;
	}
	$stack->clearToContext( BalanceSets::$tableBodyContextSet );
	$stack->pop();
	$this->switchMode( 'inTableMode' );
	return true;
}
/**
 * Helper function for inRowMode: close the open <tr> and return to
 * inTableBodyMode.
 *
 * @return bool False when no row is in table scope (nothing is
 *   changed), true otherwise
 */
private function endRow() {
	if ( $this->stack->inTableScope( 'tr' ) ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}

/**
 * Handle a token while inside a <tr>.
 *
 * Cells are opened here; other table-structure tags close the row and
 * are reprocessed; anything not listed is delegated to inTableMode().
 *
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfClose
 * @return bool|null
 */
private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) {
	$sections = [ 'tbody', 'tfoot', 'thead' ];

	if ( $token === 'tag' ) {
		if ( $value === 'th' || $value === 'td' ) {
			$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCellMode' );
			$this->afe->insertMarker();
			return true;
		}
		$closesRow = $value === 'tr' ||
			in_array( $value, [ 'caption', 'col', 'colgroup' ], true ) ||
			in_array( $value, $sections, true );
		if ( $closesRow ) {
			// Close the row, then reprocess the token.
			if ( $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'tr' ) {
			$this->endRow();
			return true;
		}
		if ( $value === 'table' ) {
			if ( $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
		if ( in_array( $value, $sections, true ) ) {
			// Only meaningful when that section is actually open.
			if (
				$this->stack->inTableScope( $value ) &&
				$this->endRow()
			) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
		// OMITTED: <body>
		// OMITTED: <html>
		if ( in_array( $value, [ 'caption', 'col', 'colgroup', 'td', 'th' ], true ) ) {
			return true; // Ignore the token.
		}
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}

/**
 * Helper for inCellMode: close whichever cell (td first, then th) is
 * in table scope by feeding its end tag to inCellMode().
 *
 * @return bool False when neither kind of cell is in table scope
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellName ) {
		if ( $this->stack->inTableScope( $cellName ) ) {
			$this->inCellMode( 'endtag', $cellName );
			return true;
		}
	}
	return false;
}

/**
 * Handle a token while inside a <td> or <th>.
 *
 * Table-structure tags close the cell and are reprocessed; anything
 * not listed is delegated to inBodyMode().
 *
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfClose
 * @return bool|null
 */
private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		$structural = [
			'caption', 'col', 'colgroup', 'tbody', 'td',
			'tfoot', 'th', 'thead', 'tr'
		];
		if ( in_array( $value, $structural, true ) ) {
			// Close the cell, then reprocess the token.
			if ( $this->endCell() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'td' || $value === 'th' ) {
			if ( $this->stack->inTableScope( $value ) ) {
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( $value );
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
			}
			return true;
		}
		// OMITTED: <body>
		// OMITTED: <html>
		if ( in_array( $value, [ 'caption', 'col', 'colgroup' ], true ) ) {
			return true;
		}
		if ( in_array( $value, [ 'table', 'tbody', 'tfoot', 'thead', 'tr' ], true ) ) {
			if ( $this->stack->inTableScope( $value ) ) {
				// Close the cell, whichever kind it is, then let the
				// row-level handler see the end tag.
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( BalanceSets::$tableCellSet );
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	}

	// Anything else:
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
OMITTED: <html> + case 'option': + if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) { + $this->stack->pop(); + } + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + case 'optgroup': + if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) { + $this->stack->pop(); + } + if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) { + $this->stack->pop(); + } + $this->stack->insertHTMLElement( $value, $attribs ); + return true; + case 'select': + $this->inSelectMode( 'endtag', $value ); // treat it like endtag + return true; + case 'input': + case 'keygen': + case 'textarea': + if ( !$this->stack->inSelectScope( 'select' ) ) { + return true; // ignore token (fragment case) + } + $this->inSelectMode( 'endtag', 'select' ); + return $this->insertToken( $token, $value, $attribs, $selfClose ); + case 'script': + case 'template': + return $this->inHeadMode( $token, $value, $attribs, $selfClose ); + } + } elseif ( $token === 'endtag' ) { + switch ( $value ) { + case 'optgroup': + if ( + $this->stack->currentNode->isHtmlNamed( 'option' ) && + $this->stack->length() >= 2 && + $this->stack->node( $this->stack->length() - 2 )->isHtmlNamed( 'optgroup' ) + ) { + $this->stack->pop(); + } + if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) { + $this->stack->pop(); + } + return true; + case 'option': + if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) { + $this->stack->pop(); + } + return true; + case 'select': + if ( !$this->stack->inSelectScope( $value ) ) { + return true; // fragment case + } + $this->stack->popTag( $value ); + $this->resetInsertionMode(); + return true; + case 'template': + return $this->inHeadMode( $token, $value, $attribs, $selfClose ); + } + } elseif ( $token === 'comment' ) { + $this->stack->insertComment( $value ); + return true; + } + // anything else: just ignore the token + return true; + } + + private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) { + switch ( $value ) 
{ + case 'caption': + case 'table': + case 'tbody': + case 'tfoot': + case 'thead': + case 'tr': + case 'td': + case 'th': + if ( $token === 'tag' ) { + $this->inSelectInTableMode( 'endtag', 'select' ); + return $this->insertToken( $token, $value, $attribs, $selfClose ); + } elseif ( $token === 'endtag' ) { + if ( $this->stack->inTableScope( $value ) ) { + $this->inSelectInTableMode( 'endtag', 'select' ); + return $this->insertToken( $token, $value, $attribs, $selfClose ); + } + return true; + } + } + // anything else + return $this->inSelectMode( $token, $value, $attribs, $selfClose ); + } + + private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) { + if ( $token === 'text' || $token === 'comment' ) { + return $this->inBodyMode( $token, $value, $attribs, $selfClose ); + } elseif ( $token === 'eof' ) { + if ( $this->stack->indexOf( 'template' ) < 0 ) { + $this->stopParsing(); + } else { + $this->stack->popTag( 'template' ); + $this->afe->clearToMarker(); + array_pop( $this->templateInsertionModes ); + $this->resetInsertionMode(); + $this->insertToken( $token, $value, $attribs, $selfClose ); + } + return true; + } elseif ( $token === 'tag' ) { + switch ( $value ) { + case 'base': + case 'basefont': + case 'bgsound': + case 'link': + case 'meta': + case 'noframes': + // OMITTED: <script> + case 'style': + case 'template': + // OMITTED: <title> + return $this->inHeadMode( $token, $value, $attribs, $selfClose ); + + case 'caption': + case 'colgroup': + case 'tbody': + case 'tfoot': + case 'thead': + return $this->switchModeAndReprocess( + 'inTableMode', $token, $value, $attribs, $selfClose + ); + + case 'col': + return $this->switchModeAndReprocess( + 'inColumnGroupMode', $token, $value, $attribs, $selfClose + ); + + case 'tr': + return $this->switchModeAndReprocess( + 'inTableBodyMode', $token, $value, $attribs, $selfClose + ); + + case 'td': + case 'th': + return $this->switchModeAndReprocess( + 'inRowMode', $token, $value, $attribs, 
$selfClose + ); + } + return $this->switchModeAndReprocess( + 'inBodyMode', $token, $value, $attribs, $selfClose + ); + } elseif ( $token === 'endtag' ) { + switch ( $value ) { + case 'template': + return $this->inHeadMode( $token, $value, $attribs, $selfClose ); + } + return true; + } else { + Assert::invariant( false, "Bad token type: $token" ); + } + } +} |