blob: e8e26b3fd8db63618bd58212cb43519a3c75cbce (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
'use strict'
var trigram = require('n-gram').trigram
var collapse = require('collapse-white-space')
var trim = require('trim')
// Reference to `Object.prototype.hasOwnProperty`; invoked with `.call` to
// test for own properties on dictionaries.
var has = {}.hasOwnProperty
// Public API of this module.
// `clean`: normalise a value into lowercased text without punctuation/digits.
exports.clean = clean
// `trigrams`: clean, padded trigrams of a value.
exports.trigrams = getCleanTrigrams
// `asDictionary`: trigram -> occurrence count, as an `Object`.
exports.asDictionary = getCleanTrigramsAsDictionary
// `asTuples`: sorted `[trigram, count]` tuples.
exports.asTuples = getCleanTrigramsAsTuples
// `tuplesAsDictionary`: turn `[trigram, count]` tuples into an `Object`.
exports.tuplesAsDictionary = getCleanTrigramTuplesAsDictionary
// Clean `value` for language detection.
//
// Every run of characters in the range U+0021-U+0040 (ASCII punctuation,
// symbols, and the digits 0-9) is replaced by a space, runs of white space are
// collapsed into a single space, leading and trailing white space is removed,
// and the result is lowercased.
//
// `null` and `undefined` yield an empty string; any other value is coerced
// with `String`.
//
// Only built-in string methods are used here, so this function does not rely
// on external trimming or white-space-collapsing helpers.
function clean(value) {
  if (value === null || value === undefined) {
    return ''
  }

  return String(value)
    .replace(/[\u0021-\u0040]+/g, ' ')
    .replace(/\s+/g, ' ')
    .trim()
    .toLowerCase()
}
// Split `value` into trigrams after cleaning it and padding the cleaned text
// with one space on each side.
function getCleanTrigrams(value) {
  var padded = ' ' + clean(value) + ' '

  return trigram(padded)
}
// Count how often each clean, padded trigram occurs in `value`.
//
// Returns an `Object` with trigrams as its attributes and their occurrence
// count as their values.
function getCleanTrigramsAsDictionary(value) {
  var grams = getCleanTrigrams(value)
  var counts = {}
  var position
  var gram

  // Walk from the last trigram to the first, so keys are inserted in that
  // order.
  for (position = grams.length - 1; position >= 0; position--) {
    gram = grams[position]
    counts[gram] = has.call(counts, gram) ? counts[gram] + 1 : 1
  }

  return counts
}
// Get the trigram counts of `value` as an `Array` of `[trigram, count]`
// tuples, sorted with the module's `sort` comparator.
function getCleanTrigramsAsTuples(value) {
  var counts = getCleanTrigramsAsDictionary(value)
  var result = []
  var key

  for (key in counts) {
    result.push([key, counts[key]])
  }

  // `Array#sort` sorts in place and returns the same array.
  return result.sort(sort)
}
// Convert an `Array` of `[trigram, count]` tuples into an `Object` with
// trigrams as its attributes and their counts as values.
//
// Tuples are visited from last to first, so when a trigram appears more than
// once, the tuple with the lowest index determines the final value.
function getCleanTrigramTuplesAsDictionary(tuples) {
  var result = {}
  var position
  var pair

  for (position = tuples.length - 1; position >= 0; position--) {
    pair = tuples[position]
    result[pair[0]] = pair[1]
  }

  return result
}
// Comparator for `Array#sort`: orders tuples ascending by the number stored
// at index `1` of each tuple.
function sort(a, b) {
  var left = a[1]
  var right = b[1]

  return left - right
}
|