summaryrefslogtreecommitdiff
path: root/www/wiki/tests/parser/ParserTestResultNormalizer.php
blob: fbeed97bc0304ae1587a605f885c78b95bd9056d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
<?php
/**
 * @file
 * @ingroup Testing
 */

/**
 * Normalizes parser test output so that results produced by different
 * HTML tidying back ends can be compared as strings.
 *
 * The text is wrapped in <html><body>...</body></html>, parsed as XML, run
 * through a caller-selected list of normalization methods, and serialized
 * back to XML. Text that is not well-formed XML is returned untouched.
 */
class ParserTestResultNormalizer {
	/** @var DOMDocument Document holding the wrapped input text */
	protected $doc;

	/** @var DOMXPath XPath evaluator bound to $doc */
	protected $xpath;

	/** @var DOMNode|null The wrapping <body> element, or null if parsing failed */
	protected $body;

	/** @var bool True when the input could not be parsed as XML */
	protected $invalid = false;

	/**
	 * Parse $text, apply the named normalization methods in order and
	 * return the serialized result.
	 *
	 * @param string $text Markup to normalize
	 * @param string[] $funcs Names of methods of this class to invoke, in
	 *   order (for example 'removeTbody' or 'trimWhitespace')
	 * @return string Normalized XML, or $text unchanged if it is not
	 *   well-formed XML
	 */
	public static function normalize( $text, $funcs ) {
		$norm = new self( $text );
		if ( $norm->invalid ) {
			return $text;
		}
		foreach ( $funcs as $func ) {
			$norm->$func();
		}
		return $norm->serialize();
	}

	/**
	 * @param string $text Markup to load; it is wrapped in <html><body>
	 *   before parsing. On parse failure $this->invalid is set to true.
	 */
	protected function __construct( $text ) {
		$this->doc = new DOMDocument( '1.0', 'utf-8' );

		// Note: parsing a supposedly XHTML document with an XML parser is not
		// guaranteed to give accurate results. For example, it may introduce
		// differences in the number of line breaks in <pre> tags.

		// Collect libxml diagnostics internally instead of emitting PHP
		// warnings; malformed input is an expected case here. The previous
		// setting is restored even if loadXML() throws.
		$previous = libxml_use_internal_errors( true );
		try {
			if ( !$this->doc->loadXML( '<html><body>' . $text . '</body></html>' ) ) {
				$this->invalid = true;
			}
		} finally {
			libxml_clear_errors();
			libxml_use_internal_errors( $previous );
		}

		$this->xpath = new DOMXPath( $this->doc );
		// item( 0 ) is the outermost <body> in document order, i.e. our
		// wrapper; it is null when the document failed to load.
		$this->body = $this->xpath->query( '//body' )->item( 0 );
	}

	/**
	 * Unwrap every <tbody> element: its children are moved up to take its
	 * place in the parent, and the <tbody> itself is dropped.
	 */
	protected function removeTbody() {
		foreach ( $this->xpath->query( '//tbody' ) as $tbody ) {
			$parent = $tbody->parentNode;
			while ( $tbody->firstChild ) {
				// insertBefore() detaches the node from $tbody before
				// re-inserting it, so the loop terminates.
				$parent->insertBefore( $tbody->firstChild, $tbody );
			}
			$parent->removeChild( $tbody );
		}
	}

	/**
	 * The point of this function is to produce a normalized DOM in which
	 * Tidy's output matches the output of html5depurate. Tidy both trims
	 * and pretty-prints, so this requires fairly aggressive treatment.
	 *
	 * In particular, note that Tidy converts <pre>x</pre> to <pre>\nx\n</pre>,
	 * which theoretically affects display since the second line break is not
	 * ignored by compliant HTML parsers.
	 *
	 * Text nodes that are direct children of <pre> lose at most one leading
	 * and one trailing line break; all other text nodes are fully trimmed.
	 * Text nodes that end up empty are removed from the document.
	 */
	protected function trimWhitespace() {
		foreach ( $this->xpath->query( '//text()' ) as $child ) {
			$data = $child->data;
			if ( strtolower( $child->parentNode->nodeName ) === 'pre' ) {
				// Just trim one line break from the start and end.
				// The cast guards against substr() returning false for an
				// out-of-range start on older PHP versions.
				if ( $data !== '' && $data[0] === "\n" ) {
					$data = (string)substr( $data, 1 );
				}
				if ( $data !== '' && substr( $data, -1 ) === "\n" ) {
					$data = (string)substr( $data, 0, -1 );
				}
			} else {
				// Trim all whitespace
				$data = trim( $data );
			}

			if ( $data === '' ) {
				$child->parentNode->removeChild( $child );
			} else {
				$child->data = $data;
			}
		}
	}

	/**
	 * Serialize the XML DOM for comparison purposes. This does not generate HTML.
	 *
	 * Only the children of the wrapping <body> are emitted, so the wrapper
	 * never appears in the output, including when the body is empty.
	 *
	 * @return string
	 */
	protected function serialize() {
		if ( !$this->body ) {
			return '';
		}
		$out = '';
		foreach ( $this->body->childNodes as $node ) {
			$out .= $this->doc->saveXML( $node );
		}
		return $out;
	}
}