diff options
Diffstat (limited to 'platform/www/inc/indexer.php')
-rw-r--r-- | platform/www/inc/indexer.php | 369 |
1 files changed, 369 insertions, 0 deletions
<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The result is computed once per request and then served from a static cache.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string INDEXER_VERSION, followed by "+plugin=version" for every
 *                    entry plugins added to the event data (sorted by plugin name)
 */
function idx_get_version(){
    static $indexer_version = null;
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = array('dokuwiki'=>$version);
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
        unset($data['dokuwiki']); // this needs to be first
        ksort($data);
        foreach ($data as $plugin=>$vers)
            $version .= '+'.$plugin.'='.$vers;
        $indexer_version = $version;
    }
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * Every byte in the range 0xE2-0xEF adds (byte value - 0xE1) on top of the
 * plain byte length.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w){
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) {
        foreach($leadbytes[0] as $b)
            $l += ord($b) - 0xE1;
    }
    return $l;
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on first use and shared by all later calls.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $Indexer;
    if (!isset($Indexer)) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read once from the stopwords.txt of the configured language
 * and returned by reference. A missing or unreadable file yields an empty list.
 *
 * @return array list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        $stopwords = array();
        if(file_exists($swfile)){
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false on read errors; never cache that as the word list
            if ($lines !== false) $stopwords = $lines;
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Note: when $verbose is set and the end of the function is reached, true is
 * returned regardless of the result of the index update.
 *
 * @triggers INDEXER_PAGE_ADD with the keys 'page', 'body', 'metadata' and 'pid'
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the function completed successfully
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false, $force=false) {
    $idxtag = metaFN($page,'.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if(!$force && file_exists($idxtag)){
        if(trim(io_readFile($idxtag)) == idx_get_version()){
            $last = @filemtime($idxtag);
            if($last > @filemtime(wikiFN($page))){
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    // the page may opt out of indexing through its metadata
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }
    $body = '';
    $metadata = array();
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    // only array metadata has keys to index; anything else is treated as "none"
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = is_array($references) ? array_keys($references) : array();

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = is_array($media) ? array_keys($media) : array();

    $data = compact('page', 'body', 'metadata', 'pid');
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Take over only the fields that are used below. A blanket extract() would
    // let event handlers overwrite unrelated locals such as $Indexer or $verbose.
    if (isset($data['page'])) $page = $data['page'];
    if (isset($data['body'])) $body = $data['body'];
    if (isset($data['metadata'])) $metadata = $data['metadata'];

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // remember which indexer version wrote the index for this page
    if ($result)
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Thin wrapper around Indexer::tokenizer() of the shared indexer instance.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * The file read is "<indexdir>/<idx><suffix>.idx"; lines keep their newline.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array lines of the index file, empty when it is missing or unreadable
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return array();
    $lines = file($fn);
    // file() returns false on read errors; callers are promised an array
    return ($lines === false) ? array() : $lines;
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * With $conf['readdircache'] set to 0 the directory is always scanned and no
 * cache is written. Otherwise "<indexdir>/lengths.idx" is used as long as it is
 * younger than $conf['readdircache'] seconds and rewritten after each scan.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';
    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // no usable cache: collect the lengths from the i<length>.idx file names
    $dir = @opendir($conf['indexdir']);
    if ($dir === false)
        return array();
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i))
                $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // Caching is best effort. The handle must be checked because passing
        // false to fwrite()/fclose() raises a TypeError that @ cannot silence.
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter array whose keys are the lengths to test for an
 *                          existing index file, or a minimum length
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $idx = array();
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir']."/i";
        foreach ($filter as $key => $value) {
            if (file_exists($path.$key.'.idx'))
                $idx[] = $key;
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $key => $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter)
                $idx[] = $length;
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $name = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));
    // NOTE(review): in this single-quoted pattern "\\" reaches PCRE as one
    // backslash that escapes the colon, so a literal backslash is not turned
    // into "_" here (the next step drops it). Left as is because the result is
    // used as a file name.
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :