diff options
Diffstat (limited to 'platform/www/inc/Parsing/Lexer')
-rw-r--r-- | platform/www/inc/Parsing/Lexer/Lexer.php | 349 | ||||
-rw-r--r-- | platform/www/inc/Parsing/Lexer/ParallelRegex.php | 203 | ||||
-rw-r--r-- | platform/www/inc/Parsing/Lexer/StateStack.php | 60 |
3 files changed, 612 insertions, 0 deletions
<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * This listing bundles three classes that were added as separate files:
 *   - platform/www/inc/Parsing/Lexer/Lexer.php
 *   - platform/www/inc/Parsing/Lexer/ParallelRegex.php
 *   - platform/www/inc/Parsing/Lexer/StateStack.php
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

// ---------------------------------------------------------------------------
// platform/www/inc/Parsing/Lexer/Lexer.php
// ---------------------------------------------------------------------------

/**
 * Accepts text and breaks it into tokens.
 *
 * One compound regex (ParallelRegex) is kept per lexer mode, so the text is
 * scanned by the PHP regex engine once per token. Mode names must not start
 * with an underscore: a leading "_" marks a single-token "special" mode and
 * "__exit" marks leaving the current mode.
 */
class Lexer
{
    /** @var ParallelRegex[] compound regexes, keyed by mode name */
    protected $regexes;
    /** @var \Doku_Handler object whose methods are called for each token */
    protected $handler;
    /** @var StateStack stack of the currently nested modes */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler method name */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler Handling strategy.
     * @param string $start Starting mode.
     * @param boolean $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Returns the compound regex of a mode, creating it on first use.
     *
     * @param string $mode Mode name.
     * @return ParallelRegex
     */
    protected function regexFor($mode)
    {
        if (!isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexFor($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        $this->regexFor($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held handler.
     *
     * @param string $raw Raw text.
     * @return boolean True on success, else false.
     */
    public function parse($raw)
    {
        if (!isset($this->handler)) {
            return false;
        }
        // positions are byte offsets into the original text, hence strlen()
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            // reduce() has shortened $raw to the text following the match
            $currentLength = strlen($raw);
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (!$this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            if ($currentLength == $length) {
                // nothing was consumed: bail out instead of looping forever
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        if (!$parsed) {
            // reduce() returned false: the current mode has no patterns
            return false;
        }
        // whatever is left over did not match anything
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched text to the handler,
     * changing the lexer to a new mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A boolean mode causes no change.
     * @param int $initialPos Byte index of the unmatched portion in the raw document
     * @param int $matchPos Byte index of the matched token in the raw document
     * @return boolean False if there was any error from the handler.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (!$this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        if ($this->isModeEnd($mode)) {
            if (!$this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (!$this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the
     * mode stack.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Tests whether the mode is entered for this token only and left again immediately afterwards.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // plain patterns carry the boolean label true; only strings can be special
        return is_string($mode) && strncmp($mode, "_", 1) === 0;
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the handler method named after the current mode.
     *
     * Empty content will be ignored. A mode remapped through mapHandler() calls the mapped
     * method instead.
     *
     * @param string $content Text parsed.
     * @param boolean|int $is_match One of the DOKU_LEXER_* state values.
     * @param int $pos Byte index of $content in the raw document
     * @return bool Result of the handler call, true for empty content.
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (strncmp($handler, 'plugin_', 7) === 0) {
            list($handler, $plugin) = explode('_', $handler, 2);
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will be eaten.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (!isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every occurrence of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // single pass; "\\\\$0" is a literal backslash followed by the matched character
        return preg_replace('/[\\\\.+*?\[\^\]${}=!<>|:]/', '\\\\$0', $str);
    }
}

// ---------------------------------------------------------------------------
// platform/www/inc/Parsing/Lexer/ParallelRegex.php
// ---------------------------------------------------------------------------

/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 */
class ParallelRegex
{
    /** @var string[] patterns exactly as passed to addPattern() */
    protected $patterns;
    /** @var array labels for above patterns (string, or true for unlabelled) */
    protected $labels;
    /** @var string|null the compound regex matching all patterns, null when stale */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;
    /** @var string[] escaped patterns, each wrapped in its own capture group */
    protected $compiled;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case True for case sensitive, false for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
        $this->patterns = array();
        $this->labels = array();
        $this->compiled = array();
        $this->regex = null;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * @param mixed $pattern Perl style regex. If it is a string, the (, ) lose their meaning
     *                       unless they form part of a "(?" construct such as a lookahead or
     *                       lookbehind assertion.
     * @param bool|string $label Label of regex to be returned on a match.
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null; // force a rebuild of the compound regex
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject String to match against.
     * @param string $match First matched portion of subject, "" if nothing matched.
     * @return bool|string False if no match found, label if label exists, true if not
     */
    public function match($subject, &$match)
    {
        $match = "";
        if (count($this->patterns) == 0) {
            return false;
        }
        if (!preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            return false;
        }

        $match = $matches[0];
        // Each pattern is one capture group and preg_match() drops trailing
        // unmatched groups, so the last reported group is the one that matched.
        // Testing the group text for truthiness would lose tokens such as "0".
        $idx = count($matches) - 2;
        return isset($this->labels[$idx]) ? $this->labels[$idx] : true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * @param string $subject String to match against.
     * @param array $split The split result: array containing, pre-match, match & post-match strings
     * @return bool|string False if no match found, label if label exists, true if not
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if (count($this->patterns) == 0) {
            return false;
        }

        if (!preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            $this->reportPregError();
            $split = array($subject, "", "");
            return false;
        }

        // index of the pattern that matched, see match()
        $idx = count($matches) - 2;
        // the compiled pattern is "(...)": the parentheses double as regex delimiters here
        list($pre, $post) = preg_split($this->compiled[$idx] . $this->getPerlMatchingFlags(), $subject, 2);
        $split = array($pre, $matches[0], $post);

        return isset($this->labels[$idx]) ? $this->labels[$idx] : true;
    }

    /**
     * Reports the last PCRE error, if any, through msg().
     */
    protected function reportPregError()
    {
        switch (preg_last_error()) {
            case PREG_BACKTRACK_LIMIT_ERROR:
                msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                break;
            case PREG_RECURSION_LIMIT_ERROR:
                msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                break;
            case PREG_BAD_UTF8_ERROR:
                msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                break;
            case PREG_INTERNAL_ERROR:
                msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                break;
        }
    }

    /**
     * Compounds the patterns into a single regular expression separated with the "or" operator.
     *
     * Caches the regex. Will automatically escape (, ) and / tokens. The patterns given to
     * addPattern() are left untouched, so patterns may still be added after the regex was
     * built once.
     *
     * @return string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $this->compiled = array();
            foreach ($this->patterns as $pattern) {
                $this->compiled[] = '(' . $this->escapePattern($pattern) . ')';
            }
            $this->regex = "/" . implode("|", $this->compiled) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes the grouping parentheses and slashes of a single pattern.
     *
     * @param string $raw Pattern as given to addPattern().
     * @return string Pattern that is safe to wrap in a group and in "/" delimiters.
     */
    protected function escapePattern($raw)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
            '\(\?|' .
            '[()]|' .
            '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
            '[^[()\\\\]+/', $raw, $elts);

        $pattern = "";
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            if ($elt === '(') {
                // plain parentheses are literal
                $pattern .= '\(';
            } elseif ($elt === ')') {
                if ($level > 0) {
                    $level--; // closes a "(?" group, keep its meaning
                } else {
                    $pattern .= '\\';
                }
                $pattern .= ')';
            } elseif ($elt === '(?') {
                $level++;
                $pattern .= '(?';
            } elseif ($elt[0] === '\\') {
                // escape sequences pass through unchanged
                $pattern .= $elt;
            } else {
                // "/" is the delimiter of the compound regex
                $pattern .= str_replace('/', '\/', $elt);
            }
        }
        return $pattern;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}

// ---------------------------------------------------------------------------
// platform/www/inc/Parsing/Lexer/StateStack.php
// ---------------------------------------------------------------------------

/**
 * States for a stack machine.
 */
class StateStack
{
    /** @var string[] entered states, the last element is the current one */
    protected $stack;

    /**
     * Constructor. Starts in named state.
     * @param string $start Starting state name.
     */
    public function __construct($start)
    {
        $this->stack = array($start);
    }

    /**
     * Accessor for current state.
     * @return string State.
     */
    public function getCurrent()
    {
        return $this->stack[count($this->stack) - 1];
    }

    /**
     * Adds a state to the stack and sets it to be the current state.
     *
     * @param string $state New state.
     */
    public function enter($state)
    {
        $this->stack[] = $state;
    }

    /**
     * Leaves the current state and reverts to the previous one.
     * @return boolean false if we attempt to drop off the bottom of the list.
     */
    public function leave()
    {
        if (count($this->stack) == 1) {
            // the starting state can never be left
            return false;
        }
        array_pop($this->stack);
        return true;
    }
}