platform/www/inc/indexer.php


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369

<?php
/**
 * Functions to create the fulltext search index
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string
 */
function idx_get_version(){
    static $indexer_version = null;
    if ($indexer_version == null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = array('dokuwiki'=>$version);
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
        unset($data['dokuwiki']); // this needs to be first
        ksort($data);
        foreach ($data as $plugin=>$vers)
            $version .= '+'.$plugin.'='.$vers;
        $indexer_version = $version;
    }
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w){
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) {
        foreach($leadbytes[0] as $b)
            $l += ord($b) - 0xE1;
    }
    return $l;
}

/**
 * Create an instance of the indexer.
 *
 * @return Indexer    an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $Indexer;
    if (!isset($Indexer)) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * @return array                list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        if(file_exists($swfile)){
            $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
        }else{
            $stopwords = array();
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the function completed successfully
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false, $force=false) {
    $idxtag = metaFN($page,'.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if(!$force && file_exists($idxtag)){
        if(trim(io_readFile($idxtag)) == idx_get_version()){
            $last = @filemtime($idxtag);
            if($last > @filemtime(wikiFN($page))){
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }
    $body = '';
    $metadata = array();
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);
    if (($references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED)) !== null)
        $metadata['relation_references'] = array_keys($references);
    else
        $metadata['relation_references'] = array();

    if (($media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED)) !== null)
        $metadata['relation_media'] = array_keys($media);
    else
        $metadata['relation_media'] = array();

    $data = compact('page', 'body', 'metadata', 'pid');
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    if ($result)
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return array();
    return file($fn);
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array
 */
function idx_listIndexLengths() {
    global $conf;
    // testing what we have to do, create a cache file or not.
    if ($conf['readdircache'] == 0) {
        $docache = false;
    } else {
        clearstatcache();
        if (file_exists($conf['indexdir'].'/lengths.idx')
        && (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) {
            if (
                ($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES))
                !== false
            ) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
        $docache = true;
    }

    if ($conf['readdircache'] == 0 || $docache) {
        $dir = @opendir($conf['indexdir']);
        if ($dir === false)
            return array();
        $idx = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i))
                    $idx[] = (int)$i;
            }
        }
        closedir($dir);
        sort($idx);
        // save this in a file
        if ($docache) {
            $handle = @fopen($conf['indexdir'].'/lengths.idx', 'w');
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
        return $idx;
    }

    return array();
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $idx = array();
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir']."/i";
        foreach ($filter as $key => $value) {
            if (file_exists($path.$key.'.idx'))
                $idx[] = $key;
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $key => $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter)
                $idx[] = $length;
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $name = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :