summaryrefslogtreecommitdiff
path: root/platform/www/inc/Parsing/Lexer
diff options
context:
space:
mode:
Diffstat (limited to 'platform/www/inc/Parsing/Lexer')
-rw-r--r--platform/www/inc/Parsing/Lexer/Lexer.php349
-rw-r--r--platform/www/inc/Parsing/Lexer/ParallelRegex.php203
-rw-r--r--platform/www/inc/Parsing/Lexer/StateStack.php60
3 files changed, 612 insertions, 0 deletions
diff --git a/platform/www/inc/Parsing/Lexer/Lexer.php b/platform/www/inc/Parsing/Lexer/Lexer.php
new file mode 100644
index 0000000..edcd251
--- /dev/null
+++ b/platform/www/inc/Parsing/Lexer/Lexer.php
@@ -0,0 +1,349 @@
+<?php
+/**
+ * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
+ * For an intro to the Lexer see:
+ * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
+ *
+ * @author Marcus Baker http://www.lastcraft.com
+ */
+
+namespace dokuwiki\Parsing\Lexer;
+
+/**
+ * Accepts text and breaks it into tokens.
+ *
+ * Some optimisation to make the sure the content is only scanned by the PHP regex
+ * parser once. Lexer modes must not start with leading underscores.
+ */
+class Lexer
+{
+ /** @var ParallelRegex[] */
+ protected $regexes;
+ /** @var \Doku_Handler */
+ protected $handler;
+ /** @var StateStack */
+ protected $modeStack;
+ /** @var array mode "rewrites" */
+ protected $mode_handlers;
+ /** @var bool case sensitive? */
+ protected $case;
+
+ /**
+ * Sets up the lexer in case insensitive matching by default.
+ *
+ * @param \Doku_Handler $handler Handling strategy by reference.
+ * @param string $start Starting handler.
+ * @param boolean $case True for case sensitive.
+ */
+ public function __construct($handler, $start = "accept", $case = false)
+ {
+ $this->case = $case;
+ $this->regexes = array();
+ $this->handler = $handler;
+ $this->modeStack = new StateStack($start);
+ $this->mode_handlers = array();
+ }
+
+ /**
+ * Adds a token search pattern for a particular parsing mode.
+ *
+ * The pattern does not change the current mode.
+ *
+ * @param string $pattern Perl style regex, but ( and )
+ * lose the usual meaning.
+ * @param string $mode Should only apply this
+ * pattern when dealing with
+ * this type of input.
+ */
+ public function addPattern($pattern, $mode = "accept")
+ {
+ if (! isset($this->regexes[$mode])) {
+ $this->regexes[$mode] = new ParallelRegex($this->case);
+ }
+ $this->regexes[$mode]->addPattern($pattern);
+ }
+
+ /**
+ * Adds a pattern that will enter a new parsing mode.
+ *
+ * Useful for entering parenthesis, strings, tags, etc.
+ *
+ * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
+ * @param string $mode Should only apply this pattern when dealing with this type of input.
+ * @param string $new_mode Change parsing to this new nested mode.
+ */
+ public function addEntryPattern($pattern, $mode, $new_mode)
+ {
+ if (! isset($this->regexes[$mode])) {
+ $this->regexes[$mode] = new ParallelRegex($this->case);
+ }
+ $this->regexes[$mode]->addPattern($pattern, $new_mode);
+ }
+
+ /**
+ * Adds a pattern that will exit the current mode and re-enter the previous one.
+ *
+ * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
+ * @param string $mode Mode to leave.
+ */
+ public function addExitPattern($pattern, $mode)
+ {
+ if (! isset($this->regexes[$mode])) {
+ $this->regexes[$mode] = new ParallelRegex($this->case);
+ }
+ $this->regexes[$mode]->addPattern($pattern, "__exit");
+ }
+
+ /**
+ * Adds a pattern that has a special mode.
+ *
+ * Acts as an entry and exit pattern in one go, effectively calling a special
+ * parser handler for this token only.
+ *
+ * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
+ * @param string $mode Should only apply this pattern when dealing with this type of input.
+ * @param string $special Use this mode for this one token.
+ */
+ public function addSpecialPattern($pattern, $mode, $special)
+ {
+ if (! isset($this->regexes[$mode])) {
+ $this->regexes[$mode] = new ParallelRegex($this->case);
+ }
+ $this->regexes[$mode]->addPattern($pattern, "_$special");
+ }
+
+ /**
+ * Adds a mapping from a mode to another handler.
+ *
+ * @param string $mode Mode to be remapped.
+ * @param string $handler New target handler.
+ */
+ public function mapHandler($mode, $handler)
+ {
+ $this->mode_handlers[$mode] = $handler;
+ }
+
+ /**
+ * Splits the page text into tokens.
+ *
+ * Will fail if the handlers report an error or if no content is consumed. If successful then each
+ * unparsed and parsed token invokes a call to the held listener.
+ *
+ * @param string $raw Raw HTML text.
+ * @return boolean True on success, else false.
+ */
+ public function parse($raw)
+ {
+ if (! isset($this->handler)) {
+ return false;
+ }
+ $initialLength = strlen($raw);
+ $length = $initialLength;
+ $pos = 0;
+ while (is_array($parsed = $this->reduce($raw))) {
+ list($unmatched, $matched, $mode) = $parsed;
+ $currentLength = strlen($raw);
+ $matchPos = $initialLength - $currentLength - strlen($matched);
+ if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
+ return false;
+ }
+ if ($currentLength == $length) {
+ return false;
+ }
+ $length = $currentLength;
+ $pos = $initialLength - $currentLength;
+ }
+ if (!$parsed) {
+ return false;
+ }
+ return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
+ }
+
+ /**
+ * Gives plugins access to the mode stack
+ *
+ * @return StateStack
+ */
+ public function getModeStack()
+ {
+ return $this->modeStack;
+ }
+
+ /**
+ * Sends the matched token and any leading unmatched
+ * text to the parser changing the lexer to a new
+ * mode if one is listed.
+ *
+ * @param string $unmatched Unmatched leading portion.
+ * @param string $matched Actual token match.
+ * @param bool|string $mode Mode after match. A boolean false mode causes no change.
+ * @param int $initialPos
+ * @param int $matchPos Current byte index location in raw doc thats being parsed
+ * @return boolean False if there was any error from the parser.
+ */
+ protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
+ {
+ if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
+ return false;
+ }
+ if ($this->isModeEnd($mode)) {
+ if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
+ return false;
+ }
+ return $this->modeStack->leave();
+ }
+ if ($this->isSpecialMode($mode)) {
+ $this->modeStack->enter($this->decodeSpecial($mode));
+ if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
+ return false;
+ }
+ return $this->modeStack->leave();
+ }
+ if (is_string($mode)) {
+ $this->modeStack->enter($mode);
+ return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
+ }
+ return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
+ }
+
+ /**
+ * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
+ * mode stack.
+ *
+ * @param string $mode Mode to test.
+ * @return boolean True if this is the exit mode.
+ */
+ protected function isModeEnd($mode)
+ {
+ return ($mode === "__exit");
+ }
+
+ /**
+ * Test to see if the mode is one where this mode is entered for this token only and automatically
+ * leaves immediately afterwoods.
+ *
+ * @param string $mode Mode to test.
+ * @return boolean True if this is the exit mode.
+ */
+ protected function isSpecialMode($mode)
+ {
+ return (strncmp($mode, "_", 1) == 0);
+ }
+
+ /**
+ * Strips the magic underscore marking single token modes.
+ *
+ * @param string $mode Mode to decode.
+ * @return string Underlying mode name.
+ */
+ protected function decodeSpecial($mode)
+ {
+ return substr($mode, 1);
+ }
+
+ /**
+ * Calls the parser method named after the current mode.
+ *
+ * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
+ *
+ * @param string $content Text parsed.
+ * @param boolean $is_match Token is recognised rather
+ * than unparsed data.
+ * @param int $pos Current byte index location in raw doc
+ * thats being parsed
+ * @return bool
+ */
+ protected function invokeHandler($content, $is_match, $pos)
+ {
+ if (($content === "") || ($content === false)) {
+ return true;
+ }
+ $handler = $this->modeStack->getCurrent();
+ if (isset($this->mode_handlers[$handler])) {
+ $handler = $this->mode_handlers[$handler];
+ }
+
+ // modes starting with plugin_ are all handled by the same
+ // handler but with an additional parameter
+ if (substr($handler, 0, 7)=='plugin_') {
+ list($handler,$plugin) = explode('_', $handler, 2);
+ return $this->handler->$handler($content, $is_match, $pos, $plugin);
+ }
+
+ return $this->handler->$handler($content, $is_match, $pos);
+ }
+
+ /**
+ * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
+ * unparsed data. Empty strings will not be matched.
+ *
+ * @param string $raw The subject to parse. This is the content that will be eaten.
+ * @return array|bool Three item list of unparsed content followed by the
+ * recognised token and finally the action the parser is to take.
+ * True if no match, false if there is a parsing error.
+ */
+ protected function reduce(&$raw)
+ {
+ if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
+ return false;
+ }
+ if ($raw === "") {
+ return true;
+ }
+ if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
+ list($unparsed, $match, $raw) = $split;
+ return array($unparsed, $match, $action);
+ }
+ return true;
+ }
+
+ /**
+ * Escapes regex characters other than (, ) and /
+ *
+ * @param string $str
+ * @return string
+ */
+ public static function escape($str)
+ {
+ $chars = array(
+ '/\\\\/',
+ '/\./',
+ '/\+/',
+ '/\*/',
+ '/\?/',
+ '/\[/',
+ '/\^/',
+ '/\]/',
+ '/\$/',
+ '/\{/',
+ '/\}/',
+ '/\=/',
+ '/\!/',
+ '/\</',
+ '/\>/',
+ '/\|/',
+ '/\:/'
+ );
+
+ $escaped = array(
+ '\\\\\\\\',
+ '\.',
+ '\+',
+ '\*',
+ '\?',
+ '\[',
+ '\^',
+ '\]',
+ '\$',
+ '\{',
+ '\}',
+ '\=',
+ '\!',
+ '\<',
+ '\>',
+ '\|',
+ '\:'
+ );
+ return preg_replace($chars, $escaped, $str);
+ }
+}
diff --git a/platform/www/inc/Parsing/Lexer/ParallelRegex.php b/platform/www/inc/Parsing/Lexer/ParallelRegex.php
new file mode 100644
index 0000000..96f61a1
--- /dev/null
+++ b/platform/www/inc/Parsing/Lexer/ParallelRegex.php
@@ -0,0 +1,203 @@
+<?php
+/**
+ * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
+ * For an intro to the Lexer see:
+ * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
+ *
+ * @author Marcus Baker http://www.lastcraft.com
+ */
+
+namespace dokuwiki\Parsing\Lexer;
+
+/**
+ * Compounded regular expression.
+ *
+ * Any of the contained patterns could match and when one does it's label is returned.
+ */
+class ParallelRegex
+{
+ /** @var string[] patterns to match */
+ protected $patterns;
+ /** @var string[] labels for above patterns */
+ protected $labels;
+ /** @var string the compound regex matching all patterns */
+ protected $regex;
+ /** @var bool case sensitive matching? */
+ protected $case;
+
+ /**
+ * Constructor. Starts with no patterns.
+ *
+ * @param boolean $case True for case sensitive, false
+ * for insensitive.
+ */
+ public function __construct($case)
+ {
+ $this->case = $case;
+ $this->patterns = array();
+ $this->labels = array();
+ $this->regex = null;
+ }
+
+ /**
+ * Adds a pattern with an optional label.
+ *
+ * @param mixed $pattern Perl style regex. Must be UTF-8
+ * encoded. If its a string, the (, )
+ * lose their meaning unless they
+ * form part of a lookahead or
+ * lookbehind assertation.
+ * @param bool|string $label Label of regex to be returned
+ * on a match. Label must be ASCII
+ */
+ public function addPattern($pattern, $label = true)
+ {
+ $count = count($this->patterns);
+ $this->patterns[$count] = $pattern;
+ $this->labels[$count] = $label;
+ $this->regex = null;
+ }
+
+ /**
+ * Attempts to match all patterns at once against a string.
+ *
+ * @param string $subject String to match against.
+ * @param string $match First matched portion of
+ * subject.
+ * @return bool|string False if no match found, label if label exists, true if not
+ */
+ public function match($subject, &$match)
+ {
+ if (count($this->patterns) == 0) {
+ return false;
+ }
+ if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
+ $match = "";
+ return false;
+ }
+
+ $match = $matches[0];
+ $size = count($matches);
+ // FIXME this could be made faster by storing the labels as keys in a hashmap
+ for ($i = 1; $i < $size; $i++) {
+ if ($matches[$i] && isset($this->labels[$i - 1])) {
+ return $this->labels[$i - 1];
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Attempts to split the string against all patterns at once
+ *
+ * @param string $subject String to match against.
+ * @param array $split The split result: array containing, pre-match, match & post-match strings
+ * @return boolean True on success.
+ *
+ * @author Christopher Smith <chris@jalakai.co.uk>
+ */
+ public function split($subject, &$split)
+ {
+ if (count($this->patterns) == 0) {
+ return false;
+ }
+
+ if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
+ if (function_exists('preg_last_error')) {
+ $err = preg_last_error();
+ switch ($err) {
+ case PREG_BACKTRACK_LIMIT_ERROR:
+ msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
+ break;
+ case PREG_RECURSION_LIMIT_ERROR:
+ msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
+ break;
+ case PREG_BAD_UTF8_ERROR:
+ msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
+ break;
+ case PREG_INTERNAL_ERROR:
+ msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
+ break;
+ }
+ }
+
+ $split = array($subject, "", "");
+ return false;
+ }
+
+ $idx = count($matches)-2;
+ list($pre, $post) = preg_split($this->patterns[$idx].$this->getPerlMatchingFlags(), $subject, 2);
+ $split = array($pre, $matches[0], $post);
+
+ return isset($this->labels[$idx]) ? $this->labels[$idx] : true;
+ }
+
+ /**
+ * Compounds the patterns into a single
+ * regular expression separated with the
+ * "or" operator. Caches the regex.
+ * Will automatically escape (, ) and / tokens.
+ *
+ * @return null|string
+ */
+ protected function getCompoundedRegex()
+ {
+ if ($this->regex == null) {
+ $cnt = count($this->patterns);
+ for ($i = 0; $i < $cnt; $i++) {
+ /*
+ * decompose the input pattern into "(", "(?", ")",
+ * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
+ * elements.
+ */
+ preg_match_all('/\\\\.|' .
+ '\(\?|' .
+ '[()]|' .
+ '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
+ '[^[()\\\\]+/', $this->patterns[$i], $elts);
+
+ $pattern = "";
+ $level = 0;
+
+ foreach ($elts[0] as $elt) {
+ /*
+ * for "(", ")" remember the nesting level, add "\"
+ * only to the non-"(?" ones.
+ */
+
+ switch ($elt) {
+ case '(':
+ $pattern .= '\(';
+ break;
+ case ')':
+ if ($level > 0)
+ $level--; /* closing (? */
+ else $pattern .= '\\';
+ $pattern .= ')';
+ break;
+ case '(?':
+ $level++;
+ $pattern .= '(?';
+ break;
+ default:
+ if (substr($elt, 0, 1) == '\\')
+ $pattern .= $elt;
+ else $pattern .= str_replace('/', '\/', $elt);
+ }
+ }
+ $this->patterns[$i] = "($pattern)";
+ }
+ $this->regex = "/" . implode("|", $this->patterns) . "/" . $this->getPerlMatchingFlags();
+ }
+ return $this->regex;
+ }
+
+ /**
+ * Accessor for perl regex mode flags to use.
+ * @return string Perl regex flags.
+ */
+ protected function getPerlMatchingFlags()
+ {
+ return ($this->case ? "msS" : "msSi");
+ }
+}
diff --git a/platform/www/inc/Parsing/Lexer/StateStack.php b/platform/www/inc/Parsing/Lexer/StateStack.php
new file mode 100644
index 0000000..325412b
--- /dev/null
+++ b/platform/www/inc/Parsing/Lexer/StateStack.php
@@ -0,0 +1,60 @@
+<?php
+/**
+ * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
+ * For an intro to the Lexer see:
+ * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
+ *
+ * @author Marcus Baker http://www.lastcraft.com
+ */
+
+namespace dokuwiki\Parsing\Lexer;
+
+/**
+ * States for a stack machine.
+ */
+class StateStack
+{
+ protected $stack;
+
+ /**
+ * Constructor. Starts in named state.
+ * @param string $start Starting state name.
+ */
+ public function __construct($start)
+ {
+ $this->stack = array($start);
+ }
+
+ /**
+ * Accessor for current state.
+ * @return string State.
+ */
+ public function getCurrent()
+ {
+ return $this->stack[count($this->stack) - 1];
+ }
+
+ /**
+ * Adds a state to the stack and sets it to be the current state.
+ *
+ * @param string $state New state.
+ */
+ public function enter($state)
+ {
+ array_push($this->stack, $state);
+ }
+
+ /**
+ * Leaves the current state and reverts
+ * to the previous one.
+ * @return boolean false if we attempt to drop off the bottom of the list.
+ */
+ public function leave()
+ {
+ if (count($this->stack) == 1) {
+ return false;
+ }
+ array_pop($this->stack);
+ return true;
+ }
+}