* @author Tom N Harris
 */

use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) {
    define('IDX_MINWORDLENGTH', 2);
}

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The value starts with INDEXER_VERSION. Every entry that event handlers
 * add to the event data is appended as "+name=version", ordered by name.
 * The result is computed once and kept in a static variable afterwards.
 *
 * @triggers INDEXER_VERSION_GET
 *     Plugins that modify what gets indexed should hook this event and
 *     add their version info to the event data like so:
 *         $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris
 * @author Michael Hamann
 *
 * @return int|string
 */
function idx_get_version(){
    static $indexer_version = null;

    // already computed earlier in this request
    if ($indexer_version != null) {
        return $indexer_version;
    }

    $tag = INDEXER_VERSION;

    // DokuWiki version is included for the convenience of plugins
    $data = array('dokuwiki' => $tag);
    Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

    // the core version already leads the tag, so only the other entries are appended
    unset($data['dokuwiki']);
    ksort($data);
    foreach ($data as $plugin => $vers) {
        $tag .= '+'.$plugin.'='.$vers;
    }

    $indexer_version = $tag;
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The byte length is the base value. Each byte in the range 0xE2-0xEF
 * adds (byte value - 0xE1) on top of it.
 *
 * @author Tom N Harris
 *
 * @param string $w
 * @return int
 */
function wordlen($w){
    $length = strlen($w);

    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        $length += array_sum(array_map(function ($byte) {
            return ord($byte) - 0xE1;
        }, $leadbytes[0]));
    }

    return $length;
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on the first call and reused on later calls.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris
 */
function idx_get_indexer() {
    static $Indexer = null;

    if ($Indexer === null) {
        $Indexer = new Indexer();
    }

    return $Indexer;
}

/**
 * Returns words that will be ignored.
*
 * The list is read from inc/lang/<lang>/stopwords.txt (one word per line)
 * on the first call and kept in a static variable afterwards. When the
 * file is missing or cannot be read the list is empty.
 *
 * @return array list of stop words
 *
 * @author Tom N Harris
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        $stopwords = array();
        if(file_exists($swfile)){
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() yields false on a read error; callers expect an array
            if ($lines !== false) $stopwords = $lines;
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Steps, in order:
 *  - a page that no longer exists is removed from the index
 *  - unless $force is set, a page whose .indexed tag matches the current
 *    indexer version and is newer than the page file is skipped
 *  - a page whose 'internal index' metadata is false is removed from the index
 *  - otherwise the page text and metadata keys are (re)indexed and the
 *    .indexed tag is written
 *
 * @triggers INDEXER_PAGE_ADD
 *     Event data holds 'page', 'body', 'metadata' and 'pid'. Unless the
 *     default action is prevented, the raw wiki text is appended to 'body'.
 *
 * @param string  $page    name of the page to index
 * @param boolean $verbose print status messages
 * @param boolean $force   force reindexing even when the index is up to date
 * @return string|boolean  false when nothing was done or the index was locked,
 *                         otherwise the result reported by the Indexer;
 *                         always true at the end of a verbose run
 *
 * @author Tom N Harris
 */
function idx_addPage($page, $verbose=false, $force=false) {
    $idxtag = metaFN($page,'.indexed');

    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if(!$force && file_exists($idxtag)){
        if(trim(io_readFile($idxtag)) == idx_get_version()){
            $last = @filemtime($idxtag);
            if($last > @filemtime(wikiFN($page))){
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    // pages can opt out of indexing through their metadata
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }

    $body = '';
    $metadata = array();
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    // only the keys of the relation lists are indexed; anything that is not
    // an array (missing or malformed metadata) counts as "no relations"
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = is_array($references) ? array_keys($references) : array();
    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = is_array($media) ? array_keys($media) : array();

    $data = compact('page', 'body', 'metadata', 'pid');
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Take back only the four values that were handed to the event. A bare
    // extract($data) would let a handler overwrite unrelated locals such as
    // $Indexer, $verbose or $idxtag by adding keys of the same name.
    extract(array_intersect_key($data, array_flip(array('page', 'body', 'metadata', 'pid'))));

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // remember which indexer version wrote the index for this page
    if ($result)
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());

    // NOTE(review): a verbose run reports true even when $result is falsy.
    // Kept as is because callers may rely on it - confirm before changing.
    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here!
* All results are returned, regardless of permissions
 *
 * Thin wrapper around the lookup() method of the shared Indexer instance.
 *
 * @param array $words list of words to search for
 * @return array list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Thin wrapper around the tokenizer() method of the shared Indexer instance.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * The file name is built as <indexdir>/<idx><suffix>.idx. Lines are returned
 * as file() delivers them without flags, i.e. including their line endings.
 * A missing or unreadable file yields an empty array.
 *
 * @author Tom N Harris
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return array();
    $lines = file($fn);
    // file() yields false on a read error; keep the documented array return
    return ($lines === false) ? array() : $lines;
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * With $conf['readdircache'] set to 0 the directory is always scanned and
 * no cache file is used. Otherwise <indexdir>/lengths.idx is trusted while
 * it is younger than that many seconds and is rewritten after each scan.
 *
 * @author YoBoY
 *
 * @return array
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';

    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                return array_map('intval', $lengths);
            }
        }
    }

    // no usable cache: collect the numeric part of every i<N>.idx file name
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) return array();
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file; writing the cache is best effort. The previous
    // fopen/fwrite/fclose sequence passed false to fwrite() when the file
    // could not be opened, which is a TypeError on PHP 8.
    if ($docache) {
        @file_put_contents($cachefile, implode("\n", $idx));
    }

    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * When $filter is an array, its keys are taken as lengths and only those
 * with an existing i<length>.idx file are returned. Otherwise every indexed
 * length greater than or equal to $filter is returned.
 *
 * @author YoBoY
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $idx = array();
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir']."/i";
        foreach (array_keys($filter) as $key) {
            if (file_exists($path.$key.'.idx')) $idx[] = $key;
        }
    } else {
        foreach (idx_listIndexLengths() as $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter) $idx[] = $length;
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
*
 * Before the stripping step, every run of spaces, dots, slashes, colons
 * and hyphens is collapsed into a single underscore. The result is
 * lower-cased.
 *
 * @author Tom N Harris
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $romanized = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));

    // NOTE(review): inside the single-quoted pattern `\\:` ends up as an
    // escaped colon, so a backslash is not turned into an underscore here;
    // it is removed by the second replacement instead.
    $underscored = preg_replace('#[ \./\\:-]+#', '_', $romanized);

    return strtolower(preg_replace('/[^A-Za-z0-9_]/', '', $underscored));
}

//Setup VIM: ex: et ts=4 :