summaryrefslogtreecommitdiff
path: root/www/wiki/resources/src/mediawiki/mediawiki.String.js
blob: 5d9bef06327f7d82d822ebd2038b154bccedd27d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
( function () {

	/**
	 * @class mw.String
	 * @singleton
	 */

	/**
	 * Calculate the byte length of a string (accounting for UTF-8).
	 *
	 * @author Jan Paul Posma, 2011
	 * @author Timo Tijhof, 2012
	 * @author David Chan, 2013
	 *
	 * @param {string} str
	 * @return {number}
	 */
	function byteLength( str ) {
		// JavaScript strings are sequences of UTF-16 code units. Every code unit
		// contributes at least one byte to the UTF-8 encoding, so start from
		// str.length and add the extra bytes on top:
		//
		// https://en.wikipedia.org/wiki/UTF-8#Description
		// > 0000-007F: 1 byte  (no extra)
		// > 0080-07FF: 2 bytes (1 extra)
		// > 0800-D7FF: 3 bytes (2 extra)
		// > D800-DFFF: surrogates; a pair encodes one character that needs 4 bytes,
		// >            so each half is counted as 2 bytes (1 extra)
		// > E000-FFFF: 3 bytes (2 extra)
		//
		// Unpaired surrogates are also counted as 2 bytes each, so the result may
		// be off for ill-formed strings.
		var oneExtra = str.match( /[\u0080-\u07FF\uD800-\uDFFF]/g ),
			twoExtra = str.match( /[\u0800-\uD7FF\uE000-\uFFFF]/g );

		// String#match with a global pattern returns null when nothing matched.
		return str.length +
			( oneExtra ? oneExtra.length : 0 ) +
			( twoExtra ? 2 * twoExtra.length : 0 );
	}

	/**
	 * Calculate the character length of a string (accounting for UTF-16 surrogates).
	 *
	 * @param {string} str
	 * @return {number}
	 */
	function codePointLength( str ) {
		// A high surrogate immediately followed by a low surrogate encodes a single
		// character (code point) in two UTF-16 code units, so every such pair makes
		// the UTF-16 length overshoot the character count by one.
		var surrogatePairs = str.match( /[\uD800-\uDBFF][\uDC00-\uDFFF]/g );

		// String#match with a global pattern returns null when nothing matched.
		return str.length - ( surrogatePairs ? surrogatePairs.length : 0 );
	}

	// Like String#charAt, but for a character outside of the BMP the whole pair of
	// UTF-16 surrogates is returned instead of a single code unit.
	//
	// When `backwards` is falsy, `offset` is taken as the index of the first code
	// unit of the character (the pair is looked for at offset and offset + 1).
	// When `backwards` is truthy, `offset` is taken as the index of the last code
	// unit of the character (the pair is looked for at offset - 1 and offset).
	// If no well-formed pair is found there, the single code unit at `offset` is
	// returned, which is an empty string when `offset` is out of range.
	function codePointAt( string, offset, backwards ) {
		var candidate;

		// No bounds checks are needed near either end of the string:
		// String#slice just yields a shorter (possibly empty) substring,
		// which then fails the two-code-unit pattern below.
		if ( backwards ) {
			candidate = string.slice( offset - 1, offset + 1 );
		} else {
			candidate = string.slice( offset, offset + 2 );
		}

		return /^[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( candidate ) ?
			candidate :
			string.charAt( offset );
	}

	// Shared implementation behind trimByteLength() and trimCodePointLength().
	//
	// Compares `newVal` against the previously accepted `safeVal` to work out which
	// part of `newVal` was inserted, then removes characters from the end of that
	// inserted part until `lengthFn( value )` no longer exceeds `length`.
	// Returns an object with the resulting string (`newVal`) and whether anything
	// was actually removed (`trimmed`).
	function trimLength( safeVal, newVal, length, lengthFn ) {
		var sharedLen, oldUnit, newUnit, head, inserted, tail, candidate,
			prefixLen = 0,
			suffixLen = 0;

		// lengthFn is only used to measure; the value itself is returned as given
		// when it already fits within the limit.
		if ( lengthFn( newVal ) <= length ) {
			return {
				newVal: newVal,
				trimmed: false
			};
		}

		// The value is too long: reconstruct where the insertion happened.
		//
		// Both scans are bounded by the length of the shorter string. Otherwise,
		// when the added text repeats the end of the old value
		// (e.g. "foo" -> "foofoo"), the same three characters would be counted
		// both as a common start and as a common end.
		sharedLen = Math.min( newVal.length, safeVal.length );

		// Common prefix, one whole character (possibly a surrogate pair) at a time.
		// Scanning from the left first means "foo" -> "foofoo" is treated as an
		// addition at the end.
		for ( ; prefixLen < sharedLen; prefixLen += oldUnit.length ) {
			oldUnit = codePointAt( safeVal, prefixLen, false );
			newUnit = codePointAt( newVal, prefixLen, false );
			if ( oldUnit !== newUnit ) {
				break;
			}
		}

		// Common suffix, scanning backwards over whatever the prefix did not claim.
		for ( ; suffixLen < ( sharedLen - prefixLen ); suffixLen += oldUnit.length ) {
			oldUnit = codePointAt( safeVal, safeVal.length - 1 - suffixLen, true );
			newUnit = codePointAt( newVal, newVal.length - 1 - suffixLen, true );
			if ( oldUnit !== newUnit ) {
				break;
			}
		}

		// Unchanged start, inserted content, unchanged end.
		head = newVal.slice( 0, prefixLen );
		inserted = newVal.slice( prefixLen, newVal.length - suffixLen );
		tail = newVal.slice( newVal.length - suffixLen );
		candidate = head + inserted + tail;

		// Drop characters from the end of the inserted content until the limit is
		// satisfied. Stop once there is nothing left to remove (T43450).
		while ( lengthFn( candidate ) > length && inserted.length > 0 ) {
			// Remove both halves of a trailing surrogate pair together, never just one.
			inserted = inserted.slice(
				0,
				/[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inserted ) ? -2 : -1
			);
			candidate = head + inserted + tail;
		}

		return {
			newVal: candidate,
			// A pathological lengthFn() that always reports a length above the limit
			// can leave the value untouched (nothing was inserted, so nothing could
			// be removed). Report that as "not trimmed" so callers do not loop forever.
			trimmed: newVal !== candidate
		};
	}

	/**
	 * Utility function to trim down a string, based on byteLimit
	 * and given a safe start position. It supports insertion anywhere
	 * in the string, so "foo" to "fobaro" if limit is 4 will result in
	 * "fobo", not "foba". Basically emulating the native maxlength by
	 * reconstructing where the insertion occurred.
	 *
	 * @param {string} safeVal Known value that was previously returned by this
	 * function; if there is none, pass an empty string.
	 * @param {string} newVal New value that may have to be trimmed down.
	 * @param {number} byteLimit Number of bytes the value may be in size.
	 * @param {Function} [filterFn] Function to call on the string before assessing the length.
	 * @return {Object}
	 * @return {string} return.newVal
	 * @return {boolean} return.trimmed
	 */
	function trimByteLength( safeVal, newVal, byteLimit, filterFn ) {
		// Measures the UTF-8 size of the filtered value. The filter only affects
		// the measurement; it is never applied to the string being trimmed.
		function filteredByteLength( val ) {
			return byteLength( filterFn( val ) );
		}

		return trimLength(
			safeVal,
			newVal,
			byteLimit,
			// Without a filter, measure the value as-is.
			filterFn ? filteredByteLength : byteLength
		);
	}

	/**
	 * Utility function to trim down a string, based on codePointLimit
	 * and given a safe start position. It supports insertion anywhere
	 * in the string, so "foo" to "fobaro" if limit is 4 will result in
	 * "fobo", not "foba". Basically emulating the native maxlength by
	 * reconstructing where the insertion occurred.
	 *
	 * @param {string} safeVal Known value that was previously returned by this
	 * function; if there is none, pass an empty string.
	 * @param {string} newVal New value that may have to be trimmed down.
	 * @param {number} codePointLimit Number of characters the value may be in size.
	 * @param {Function} [filterFn] Function to call on the string before assessing the length.
	 * @return {Object}
	 * @return {string} return.newVal
	 * @return {boolean} return.trimmed
	 */
	function trimCodePointLength( safeVal, newVal, codePointLimit, filterFn ) {
		// Counts the characters of the filtered value. The filter only affects
		// the measurement; it is never applied to the string being trimmed.
		function filteredCodePointLength( val ) {
			return codePointLength( filterFn( val ) );
		}

		return trimLength(
			safeVal,
			newVal,
			codePointLimit,
			// Without a filter, measure the value as-is.
			filterFn ? filteredCodePointLength : codePointLength
		);
	}

	// Public interface of this module. Only these four functions are exported;
	// codePointAt() and trimLength() are helpers that stay internal to this file.
	module.exports = {
		// UTF-8 size of a string, in bytes.
		byteLength: byteLength,
		// Number of characters in a string, counting a surrogate pair as one.
		codePointLength: codePointLength,
		// Trim a changed value so it fits within a byte limit.
		trimByteLength: trimByteLength,
		// Trim a changed value so it fits within a character limit.
		trimCodePointLength: trimCodePointLength
	};

}() );