summaryrefslogtreecommitdiff
path: root/www/wiki/extensions/AbuseFilter/includes/parser/AbuseFilterTokenizer.php
diff options
context:
space:
mode:
Diffstat (limited to 'www/wiki/extensions/AbuseFilter/includes/parser/AbuseFilterTokenizer.php')
-rw-r--r--www/wiki/extensions/AbuseFilter/includes/parser/AbuseFilterTokenizer.php258
1 files changed, 258 insertions, 0 deletions
diff --git a/www/wiki/extensions/AbuseFilter/includes/parser/AbuseFilterTokenizer.php b/www/wiki/extensions/AbuseFilter/includes/parser/AbuseFilterTokenizer.php
new file mode 100644
index 00000000..a97fccaf
--- /dev/null
+++ b/www/wiki/extensions/AbuseFilter/includes/parser/AbuseFilterTokenizer.php
@@ -0,0 +1,258 @@
+<?php
+
+use MediaWiki\MediaWikiServices;
+
+/**
+ * Tokenizer for AbuseFilter rules.
+ */
+class AbuseFilterTokenizer {
+ /** @var int Tokenizer cache version. Increment this when changing the syntax. **/
+ const CACHE_VERSION = 1;
+ const COMMENT_START_RE = '/\s*\/\*/A';
+ const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';
+ const OPERATOR_RE =
+ '/(\!\=\=|\!\=|\!|\*\*|\*|\/|\+|\-|%|&|\||\^|\:\=|\?|\:|\<\=|\<|\>\=|\>|\=\=\=|\=\=|\=)/A';
+ const RADIX_RE = '/([0-9A-Fa-f]+(?:\.\d*)?|\.\d+)([bxo])?/Au';
+ const WHITESPACE = "\011\012\013\014\015\040";
+
+ // Order is important. The punctuation-matching regex requires that
+ // ** comes before *, etc. They are sorted to make it easy to spot
+ // such errors.
+ public static $operators = [
+ '!==', '!=', '!', // Inequality
+ '**', '*', // Multiplication/exponentiation
+ '/', '+', '-', '%', // Other arithmetic
+ '&', '|', '^', // Logic
+ ':=', // Setting
+ '?', ':', // Ternery
+ '<=', '<', // Less than
+ '>=', '>', // Greater than
+ '===', '==', '=', // Equality
+ ];
+
+ public static $punctuation = [
+ ',' => AFPToken::TCOMMA,
+ '(' => AFPToken::TBRACE,
+ ')' => AFPToken::TBRACE,
+ '[' => AFPToken::TSQUAREBRACKET,
+ ']' => AFPToken::TSQUAREBRACKET,
+ ';' => AFPToken::TSTATEMENTSEPARATOR,
+ ];
+
+ public static $bases = [
+ 'b' => 2,
+ 'x' => 16,
+ 'o' => 8
+ ];
+
+ public static $baseCharsRe = [
+ 2 => '/^[01]+$/',
+ 8 => '/^[0-8]+$/',
+ 16 => '/^[0-9A-Fa-f]+$/',
+ 10 => '/^[0-9.]+$/',
+ ];
+
+ public static $keywords = [
+ 'in', 'like', 'true', 'false', 'null', 'contains', 'matches',
+ 'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',
+ ];
+
+ /**
+ * @param string $code
+ * @return array
+ * @throws AFPException
+ * @throws AFPUserVisibleException
+ */
+ static function tokenize( $code ) {
+ static $tokenizerCache = null;
+
+ if ( !$tokenizerCache ) {
+ $tokenizerCache = ObjectCache::getLocalServerInstance( 'hash' );
+ }
+
+ static $stats = null;
+
+ if ( !$stats ) {
+ $stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
+ }
+
+ $cacheKey = wfGlobalCacheKey( __CLASS__, self::CACHE_VERSION, crc32( $code ) );
+
+ $tokens = $tokenizerCache->get( $cacheKey );
+
+ if ( $tokens ) {
+ $stats->increment( 'AbuseFilter.tokenizerCache.hit' );
+ return $tokens;
+ }
+
+ $stats->increment( 'AbuseFilter.tokenizerCache.miss' );
+ $tokens = [];
+ $curPos = 0;
+
+ do {
+ $prevPos = $curPos;
+ $token = self::nextToken( $code, $curPos );
+ $tokens[ $token->pos ] = [ $token, $curPos ];
+ } while ( $curPos !== $prevPos );
+
+ $tokenizerCache->set( $cacheKey, $tokens, 60 * 60 * 24 );
+
+ return $tokens;
+ }
+
+ /**
+ * @param string $code
+ * @param int &$offset
+ * @return AFPToken
+ * @throws AFPException
+ * @throws AFPUserVisibleException
+ */
+ protected static function nextToken( $code, &$offset ) {
+ $matches = [];
+ $start = $offset;
+
+ // Read past comments
+ while ( preg_match( self::COMMENT_START_RE, $code, $matches, 0, $offset ) ) {
+ if ( strpos( $code, '*/', $offset ) === false ) {
+ throw new AFPUserVisibleException(
+ 'unclosedcomment', $offset, [] );
+ }
+ $offset = strpos( $code, '*/', $offset ) + 2;
+ }
+
+ // Spaces
+ $offset += strspn( $code, self::WHITESPACE, $offset );
+ if ( $offset >= strlen( $code ) ) {
+ return new AFPToken( AFPToken::TNONE, '', $start );
+ }
+
+ $chr = $code[$offset];
+
+ // Punctuation
+ if ( isset( self::$punctuation[$chr] ) ) {
+ $offset++;
+ return new AFPToken( self::$punctuation[$chr], $chr, $start );
+ }
+
+ // String literal
+ if ( $chr === '"' || $chr === "'" ) {
+ return self::readStringLiteral( $code, $offset, $start );
+ }
+
+ $matches = [];
+
+ // Operators
+ if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset ) ) {
+ $token = $matches[0];
+ $offset += strlen( $token );
+ return new AFPToken( AFPToken::TOP, $token, $start );
+ }
+
+ // Numbers
+ if ( preg_match( self::RADIX_RE, $code, $matches, 0, $offset ) ) {
+ $token = $matches[0];
+ $input = $matches[1];
+ $baseChar = isset( $matches[2] ) ? $matches[2] : null;
+ // Sometimes the base char gets mixed in with the rest of it because
+ // the regex targets hex, too.
+ // This mostly happens with binary
+ if ( !$baseChar && !empty( self::$bases[ substr( $input, - 1 ) ] ) ) {
+ $baseChar = substr( $input, - 1, 1 );
+ $input = substr( $input, 0, - 1 );
+ }
+
+ $base = $baseChar ? self::$bases[$baseChar] : 10;
+
+ // Check against the appropriate character class for input validation
+
+ if ( preg_match( self::$baseCharsRe[$base], $input ) ) {
+ $num = $base !== 10 ? base_convert( $input, $base, 10 ) : $input;
+ $offset += strlen( $token );
+ return ( strpos( $input, '.' ) !== false )
+ ? new AFPToken( AFPToken::TFLOAT, floatval( $num ), $start )
+ : new AFPToken( AFPToken::TINT, intval( $num ), $start );
+ }
+ }
+
+ // IDs / Keywords
+
+ if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0, $offset ) ) {
+ $token = $matches[0];
+ $offset += strlen( $token );
+ $type = in_array( $token, self::$keywords )
+ ? AFPToken::TKEYWORD
+ : AFPToken::TID;
+ return new AFPToken( $type, $token, $start );
+ }
+
+ throw new AFPUserVisibleException(
+ 'unrecognisedtoken', $start, [ substr( $code, $start ) ] );
+ }
+
+ /**
+ * @param string $code
+ * @param int &$offset
+ * @param int $start
+ * @return AFPToken
+ * @throws AFPException
+ * @throws AFPUserVisibleException
+ */
+ protected static function readStringLiteral( $code, &$offset, $start ) {
+ $type = $code[$offset];
+ $offset++;
+ $length = strlen( $code );
+ $token = '';
+ while ( $offset < $length ) {
+ if ( $code[$offset] === $type ) {
+ $offset++;
+ return new AFPToken( AFPToken::TSTRING, $token, $start );
+ }
+
+ // Performance: Use a PHP function (implemented in C)
+ // to scan ahead.
+ $addLength = strcspn( $code, $type . "\\", $offset );
+ if ( $addLength ) {
+ $token .= substr( $code, $offset, $addLength );
+ $offset += $addLength;
+ } elseif ( $code[$offset] == '\\' ) {
+ switch ( $code[$offset + 1] ) {
+ case '\\':
+ $token .= '\\';
+ break;
+ case $type:
+ $token .= $type;
+ break;
+ case 'n';
+ $token .= "\n";
+ break;
+ case 'r':
+ $token .= "\r";
+ break;
+ case 't':
+ $token .= "\t";
+ break;
+ case 'x':
+ $chr = substr( $code, $offset + 2, 2 );
+
+ if ( preg_match( '/^[0-9A-Fa-f]{2}$/', $chr ) ) {
+ $chr = base_convert( $chr, 16, 10 );
+ $token .= chr( $chr );
+ $offset += 2; # \xXX -- 2 done later
+ } else {
+ $token .= 'x';
+ }
+ break;
+ default:
+ $token .= "\\" . $code[$offset + 1];
+ }
+
+ $offset += 2;
+
+ } else {
+ $token .= $code[$offset];
+ $offset++;
+ }
+ }
+ throw new AFPUserVisibleException( 'unclosedstring', $offset, [] );
+ }
+}