summaryrefslogtreecommitdiff
path: root/bin/wiki/ImportarDesdeURL/node_modules/franc/index.js
diff options
context:
space:
mode:
Diffstat (limited to 'bin/wiki/ImportarDesdeURL/node_modules/franc/index.js')
-rw-r--r--bin/wiki/ImportarDesdeURL/node_modules/franc/index.js313
1 files changed, 313 insertions, 0 deletions
diff --git a/bin/wiki/ImportarDesdeURL/node_modules/franc/index.js b/bin/wiki/ImportarDesdeURL/node_modules/franc/index.js
new file mode 100644
index 00000000..90f251ed
--- /dev/null
+++ b/bin/wiki/ImportarDesdeURL/node_modules/franc/index.js
@@ -0,0 +1,313 @@
+'use strict'
+
+/* Load `trigram-utils`. */
+var utilities = require('trigram-utils')
+
+/* Load `expressions` (regular expressions matching
+ * scripts). */
+var expressions = require('./expressions.js')
+
+/* Load `data` (trigram information per language,
+ * per script). */
+var data = require('./data.json')
+
+/* Expose `detectAll` on `detect`. */
+detect.all = detectAll
+
+/* Expose `detect`. */
+module.exports = detect
+
+/* Maximum sample length. */
+var MAX_LENGTH = 2048
+
+/* Minimum sample length. */
+var MIN_LENGTH = 10
+
+/* The maximum distance to add when a given trigram does
+ * not exist in a trigram dictionary. */
+var MAX_DIFFERENCE = 300
+
+/* Construct trigram dictionaries. */
+;(function() {
+ var languages
+ var name
+ var trigrams
+ var model
+ var script
+ var weight
+
+ for (script in data) {
+ languages = data[script]
+
+ for (name in languages) {
+ model = languages[name].split('|')
+
+ weight = model.length
+
+ trigrams = {}
+
+ while (weight--) {
+ trigrams[model[weight]] = weight
+ }
+
+ languages[name] = trigrams
+ }
+ }
+})()
+
+/**
+ * Get the most probable language for the given value.
+ *
+ * @param {string} value - The value to test.
+ * @param {Object} options - Configuration.
+ * @return {string} The most probable language.
+ */
+function detect(value, options) {
+ return detectAll(value, options)[0][0]
+}
+
+/**
+ * Get a list of probable languages the given value is
+ * written in.
+ *
+ * @param {string} value - The value to test.
+ * @param {Object} options - Configuration.
+ * @return {Array.<Array.<string, number>>} An array
+ * containing language--distance tuples.
+ */
+function detectAll(value, options) {
+ var settings = options || {}
+ var minLength = MIN_LENGTH
+ var only = [].concat(settings.whitelist || [], settings.only || [])
+ var ignore = [].concat(settings.blacklist || [], settings.ignore || [])
+ var script
+
+ if (settings.minLength !== null && settings.minLength !== undefined) {
+ minLength = settings.minLength
+ }
+
+ if (!value || value.length < minLength) {
+ return und()
+ }
+
+ value = value.substr(0, MAX_LENGTH)
+
+ /* Get the script which characters occur the most
+ * in `value`. */
+ script = getTopScript(value, expressions)
+
+ /* One languages exists for the most-used script. */
+ if (!(script[0] in data)) {
+ /* If no matches occured, such as a digit only string,
+ * or because the language is ignored, exit with `und`. */
+ if (script[1] === 0 || !allow(script[0], only, ignore)) {
+ return und()
+ }
+
+ return singleLanguageTuples(script[0])
+ }
+
+ /* Get all distances for a given script, and
+ * normalize the distance values. */
+ return normalize(
+ value,
+ getDistances(utilities.asTuples(value), data[script[0]], only, ignore)
+ )
+}
+
+/**
+ * Normalize the difference for each tuple in
+ * `distances`.
+ *
+ * @param {string} value - Value to normalize.
+ * @param {Array.<Array.<string, number>>} distances
+ * - List of distances.
+ * @return {Array.<Array.<string, number>>} - Normalized
+ * distances.
+ */
+function normalize(value, distances) {
+ var min = distances[0][1]
+ var max = value.length * MAX_DIFFERENCE - min
+ var index = -1
+ var length = distances.length
+
+ while (++index < length) {
+ distances[index][1] = 1 - (distances[index][1] - min) / max || 0
+ }
+
+ return distances
+}
+
+/**
+ * From `scripts`, get the most occurring expression for
+ * `value`.
+ *
+ * @param {string} value - Value to check.
+ * @param {Object.<RegExp>} scripts - Top-Scripts.
+ * @return {Array} Top script and its
+ * occurrence percentage.
+ */
+function getTopScript(value, scripts) {
+ var topCount = -1
+ var topScript
+ var script
+ var count
+
+ for (script in scripts) {
+ count = getOccurrence(value, scripts[script])
+
+ if (count > topCount) {
+ topCount = count
+ topScript = script
+ }
+ }
+
+ return [topScript, topCount]
+}
+
+/**
+ * Get the occurrence ratio of `expression` for `value`.
+ *
+ * @param {string} value - Value to check.
+ * @param {RegExp} expression - Code-point expression.
+ * @return {number} Float between 0 and 1.
+ */
+function getOccurrence(value, expression) {
+ var count = value.match(expression)
+
+ return (count ? count.length : 0) / value.length || 0
+}
+
+/**
+ * Get the distance between an array of trigram--count
+ * tuples, and multiple trigram dictionaries.
+ *
+ * @param {Array.<Array.<string, number>>} trigrams - An
+ * array containing trigram--count tuples.
+ * @param {Object.<Object>} languages - multiple
+ * trigrams to test against.
+ * @param {Array.<string>} only - Allowed languages; if
+ * non-empty, only included languages are kept.
+ * @param {Array.<string>} ignore - Disallowed languages;
+ * included languages are ignored.
+ * @return {Array.<Array.<string, number>>} An array
+ * containing language--distance tuples.
+ */
+function getDistances(trigrams, languages, only, ignore) {
+ var distances = []
+ var language
+
+ languages = filterLanguages(languages, only, ignore)
+
+ for (language in languages) {
+ distances.push([language, getDistance(trigrams, languages[language])])
+ }
+
+ return distances.length === 0 ? und() : distances.sort(sort)
+}
+
+/**
+ * Get the distance between an array of trigram--count
+ * tuples, and a language dictionary.
+ *
+ * @param {Array.<Array.<string, number>>} trigrams - An
+ * array containing trigram--count tuples.
+ * @param {Object.<number>} model - Object
+ * containing weighted trigrams.
+ * @return {number} - The distance between the two.
+ */
+function getDistance(trigrams, model) {
+ var distance = 0
+ var index = -1
+ var length = trigrams.length
+ var trigram
+ var difference
+
+ while (++index < length) {
+ trigram = trigrams[index]
+
+ if (trigram[0] in model) {
+ difference = trigram[1] - model[trigram[0]] - 1
+
+ if (difference < 0) {
+ difference = -difference
+ }
+ } else {
+ difference = MAX_DIFFERENCE
+ }
+
+ distance += difference
+ }
+
+ return distance
+}
+
+/**
+ * Filter `languages` by removing languages in
+ * `ignore`, or including languages in `only`.
+ *
+ * @param {Object.<Object>} languages - Languages
+ * to filter
+ * @param {Array.<string>} only - Allowed languages; if
+ * non-empty, only included languages are kept.
+ * @param {Array.<string>} ignore - Disallowed languages;
+ * included languages are ignored.
+ * @return {Object.<Object>} - Filtered array of
+ * languages.
+ */
+function filterLanguages(languages, only, ignore) {
+ var filteredLanguages
+ var language
+
+ if (only.length === 0 && ignore.length === 0) {
+ return languages
+ }
+
+ filteredLanguages = {}
+
+ for (language in languages) {
+ if (allow(language, only, ignore)) {
+ filteredLanguages[language] = languages[language]
+ }
+ }
+
+ return filteredLanguages
+}
+
+/**
+ * Check if `language` can match according to settings.
+ *
+ * @param {string} language - Languages
+ * to filter
+ * @param {Array.<string>} only - Allowed languages; if
+ * non-empty, only included languages are kept.
+ * @param {Array.<string>} ignore - Disallowed languages;
+ * included languages are ignored.
+ * @return {boolean} - Whether `language` can match
+ */
+function allow(language, only, ignore) {
+ if (only.length === 0 && ignore.length === 0) {
+ return true
+ }
+
+ return (
+ (only.length === 0 || only.indexOf(language) !== -1) &&
+ ignore.indexOf(language) === -1
+ )
+}
+
+/* Create a single `und` tuple. */
+function und() {
+ return singleLanguageTuples('und')
+}
+
+/* Create a single tuple as a list of tuples from a given
+ * language code. */
+function singleLanguageTuples(language) {
+ return [[language, 1]]
+}
+
+/* Deep regular sort on the number at `1` in both objects. */
+function sort(a, b) {
+ return a[1] - b[1]
+}