URLHelpers.cpp source code [jsc/Source/WTF/wtf/URLHelpers.cpp]

1	/*
2	* Copyright (C) 2005-2019 Apple Inc. All rights reserved.
3	* Copyright (C) 2018 Igalia S.L.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* 1. Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* 2. Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* 3. Neither the name of Apple Inc. ("Apple") nor the names of
15	* its contributors may be used to endorse or promote products derived
16	* from this software without specific prior written permission.
17	*
18	* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
19	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21	* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
22	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28	*/
29
30	#include "config.h"
31	#include "URLHelpers.h"
32
33	#include "URLParser.h"
34	#include <mutex>
35	#include <unicode/uidna.h>
36	#include <unicode/uscript.h>
37	#include <wtf/Optional.h>
38	#include <wtf/text/WTFString.h>
39
40	namespace WTF {
41	namespace URLHelpers {
42
43	// Needs to be big enough to hold an IDN-encoded name.
44	// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
45	const unsigned hostNameBufferLength = `2048`;
46	const unsigned urlBytesBufferLength = `2048`;
47
48	static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + `31`) / `32`];
49
50	#if !PLATFORM(COCOA)
51
52	// Cocoa has an implementation that uses a whitelist in /Library or ~/Library,
53	// if it exists.
54	void loadIDNScriptWhiteList()
55	{
56	static std::once_flag flag;
57	std::call_once(flag, initializeDefaultIDNScriptWhiteList);
58	}
59
60	#endif // !PLATFORM(COCOA)
61
62	static bool isArmenianLookalikeCharacter(UChar32 codePoint)
63	{
64	return codePoint == `0x0548` \|\| codePoint == `0x054D` \|\| codePoint == `0x0578` \|\| codePoint == `0x057D`;
65	}
66
67	static bool isArmenianScriptCharacter(UChar32 codePoint)
68	{
69	UErrorCode error = U_ZERO_ERROR;
70	UScriptCode script = uscript_getScript(codePoint, &error);
71	if (error != U_ZERO_ERROR) {
72	LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
73	return false;
74	}
75
76	return script == USCRIPT_ARMENIAN;
77	}
78
// Returns true when charCode is an ASCII digit or punctuation character (as decided by
// isASCIIDigitOrPunctuation) that is not one of the characters the URL parser rejects in a host.
template<typename CharacterType> inline bool isASCIIDigitOrValidHostCharacter(CharacterType charCode)
{
    if (!isASCIIDigitOrPunctuation(charCode))
        return false;

    // Things the URL Parser rejects:
    const bool rejectedByURLParser = charCode == '#'
        || charCode == '%'
        || charCode == '/'
        || charCode == ':'
        || charCode == '?'
        || charCode == '@'
        || charCode == '['
        || charCode == '\\'
        || charCode == ']';
    return !rejectedByURLParser;
}
100
101	static bool isLookalikeCharacter(const Optional<UChar32>& previousCodePoint, UChar32 charCode)
102	{
103	// This function treats the following as unsafe, lookalike characters:
104	// any non-printable character, any character considered as whitespace,
105	// any ignorable character, and emoji characters related to locks.
106
107	// We also considered the characters in Mozilla's blacklist <http://kb.mozillazine.org/Network.IDN.blacklist_chars>.
108
109	// Some of the characters here will never appear once ICU has encoded.
110	// For example, ICU transforms most spaces into an ASCII space and most
111	// slashes into an ASCII solidus. But one of the two callers uses this
112	// on characters that have not been processed by ICU, so they are needed here.
113
114	if (!u_isprint(charCode) \|\| u_isUWhiteSpace(charCode) \|\| u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
115	return true;
116
117	switch (charCode) {
118	case `0x00BC`: / VULGAR FRACTION ONE QUARTER /
119	case `0x00BD`: / VULGAR FRACTION ONE HALF /
120	case `0x00BE`: / VULGAR FRACTION THREE QUARTERS /
121	case `0x00ED`: / LATIN SMALL LETTER I WITH ACUTE /
122	/ 0x0131 LATIN SMALL LETTER DOTLESS I is intentionally not considered a lookalike character because it is visually distinguishable from i and it has legitimate use in the Turkish language. /
123	case `0x01C0`: / LATIN LETTER DENTAL CLICK /
124	case `0x01C3`: / LATIN LETTER RETROFLEX CLICK /
125	case `0x0251`: / LATIN SMALL LETTER ALPHA /
126	case `0x0261`: / LATIN SMALL LETTER SCRIPT G /
127	case `0x027E`: / LATIN SMALL LETTER R WITH FISHHOOK /
128	case `0x02D0`: / MODIFIER LETTER TRIANGULAR COLON /
129	case `0x0335`: / COMBINING SHORT STROKE OVERLAY /
130	case `0x0337`: / COMBINING SHORT SOLIDUS OVERLAY /
131	case `0x0338`: / COMBINING LONG SOLIDUS OVERLAY /
132	case `0x0589`: / ARMENIAN FULL STOP /
133	case `0x05B4`: / HEBREW POINT HIRIQ /
134	case `0x05BC`: / HEBREW POINT DAGESH OR MAPIQ /
135	case `0x05C3`: / HEBREW PUNCTUATION SOF PASUQ /
136	case `0x05F4`: / HEBREW PUNCTUATION GERSHAYIM /
137	case `0x0609`: / ARABIC-INDIC PER MILLE SIGN /
138	case `0x060A`: / ARABIC-INDIC PER TEN THOUSAND SIGN /
139	case `0x0650`: / ARABIC KASRA /
140	case `0x0660`: / ARABIC INDIC DIGIT ZERO /
141	case `0x066A`: / ARABIC PERCENT SIGN /
142	case `0x06D4`: / ARABIC FULL STOP /
143	case `0x06F0`: / EXTENDED ARABIC INDIC DIGIT ZERO /
144	case `0x0701`: / SYRIAC SUPRALINEAR FULL STOP /
145	case `0x0702`: / SYRIAC SUBLINEAR FULL STOP /
146	case `0x0703`: / SYRIAC SUPRALINEAR COLON /
147	case `0x0704`: / SYRIAC SUBLINEAR COLON /
148	case `0x1735`: / PHILIPPINE SINGLE PUNCTUATION /
149	case `0x1D04`: / LATIN LETTER SMALL CAPITAL C /
150	case `0x1D0F`: / LATIN LETTER SMALL CAPITAL O /
151	case `0x1D1C`: / LATIN LETTER SMALL CAPITAL U /
152	case `0x1D20`: / LATIN LETTER SMALL CAPITAL V /
153	case `0x1D21`: / LATIN LETTER SMALL CAPITAL W /
154	case `0x1D22`: / LATIN LETTER SMALL CAPITAL Z /
155	case `0x1ECD`: / LATIN SMALL LETTER O WITH DOT BELOW /
156	case `0x2010`: / HYPHEN /
157	case `0x2011`: / NON-BREAKING HYPHEN /
158	case `0x2024`: / ONE DOT LEADER /
159	case `0x2027`: / HYPHENATION POINT /
160	case `0x2039`: / SINGLE LEFT-POINTING ANGLE QUOTATION MARK /
161	case `0x203A`: / SINGLE RIGHT-POINTING ANGLE QUOTATION MARK /
162	case `0x2041`: / CARET INSERTION POINT /
163	case `0x2044`: / FRACTION SLASH /
164	case `0x2052`: / COMMERCIAL MINUS SIGN /
165	case `0x2153`: / VULGAR FRACTION ONE THIRD /
166	case `0x2154`: / VULGAR FRACTION TWO THIRDS /
167	case `0x2155`: / VULGAR FRACTION ONE FIFTH /
168	case `0x2156`: / VULGAR FRACTION TWO FIFTHS /
169	case `0x2157`: / VULGAR FRACTION THREE FIFTHS /
170	case `0x2158`: / VULGAR FRACTION FOUR FIFTHS /
171	case `0x2159`: / VULGAR FRACTION ONE SIXTH /
172	case `0x215A`: / VULGAR FRACTION FIVE SIXTHS /
173	case `0x215B`: / VULGAR FRACTION ONE EIGHT /
174	case `0x215C`: / VULGAR FRACTION THREE EIGHTHS /
175	case `0x215D`: / VULGAR FRACTION FIVE EIGHTHS /
176	case `0x215E`: / VULGAR FRACTION SEVEN EIGHTHS /
177	case `0x215F`: / FRACTION NUMERATOR ONE /
178	case `0x2212`: / MINUS SIGN /
179	case `0x2215`: / DIVISION SLASH /
180	case `0x2216`: / SET MINUS /
181	case `0x2236`: / RATIO /
182	case `0x233F`: / APL FUNCTIONAL SYMBOL SLASH BAR /
183	case `0x23AE`: / INTEGRAL EXTENSION /
184	case `0x244A`: / OCR DOUBLE BACKSLASH /
185	case `0x2571`: / BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT /
186	case `0x2572`: / BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT /
187	case `0x29F6`: / SOLIDUS WITH OVERBAR /
188	case `0x29F8`: / BIG SOLIDUS /
189	case `0x2AFB`: / TRIPLE SOLIDUS BINARY RELATION /
190	case `0x2AFD`: / DOUBLE SOLIDUS OPERATOR /
191	case `0x2FF0`: / IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT /
192	case `0x2FF1`: / IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW /
193	case `0x2FF2`: / IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT /
194	case `0x2FF3`: / IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW /
195	case `0x2FF4`: / IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND /
196	case `0x2FF5`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE /
197	case `0x2FF6`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM BELOW /
198	case `0x2FF7`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LEFT /
199	case `0x2FF8`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER LEFT /
200	case `0x2FF9`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT /
201	case `0x2FFA`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT /
202	case `0x2FFB`: / IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID /
203	case `0x3002`: / IDEOGRAPHIC FULL STOP /
204	case `0x3008`: / LEFT ANGLE BRACKET /
205	case `0x3014`: / LEFT TORTOISE SHELL BRACKET /
206	case `0x3015`: / RIGHT TORTOISE SHELL BRACKET /
207	case `0x3033`: / VERTICAL KANA REPEAT MARK UPPER HALF /
208	case `0x3035`: / VERTICAL KANA REPEAT MARK LOWER HALF /
209	case `0x321D`: / PARENTHESIZED KOREAN CHARACTER OJEON /
210	case `0x321E`: / PARENTHESIZED KOREAN CHARACTER O HU /
211	case `0x33AE`: / SQUARE RAD OVER S /
212	case `0x33AF`: / SQUARE RAD OVER S SQUARED /
213	case `0x33C6`: / SQUARE C OVER KG /
214	case `0x33DF`: / SQUARE A OVER M /
215	case `0x05B9`: / HEBREW POINT HOLAM /
216	case `0x05BA`: / HEBREW POINT HOLAM HASER FOR VAV /
217	case `0x05C1`: / HEBREW POINT SHIN DOT /
218	case `0x05C2`: / HEBREW POINT SIN DOT /
219	case `0x05C4`: / HEBREW MARK UPPER DOT /
220	case `0xA731`: / LATIN LETTER SMALL CAPITAL S /
221	case `0xA771`: / LATIN SMALL LETTER DUM /
222	case `0xA789`: / MODIFIER LETTER COLON /
223	case `0xFE14`: / PRESENTATION FORM FOR VERTICAL SEMICOLON /
224	case `0xFE15`: / PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK /
225	case `0xFE3F`: / PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET /
226	case `0xFE5D`: / SMALL LEFT TORTOISE SHELL BRACKET /
227	case `0xFE5E`: / SMALL RIGHT TORTOISE SHELL BRACKET /
228	case `0xFF0E`: / FULLWIDTH FULL STOP /
229	case `0xFF0F`: / FULL WIDTH SOLIDUS /
230	case `0xFF61`: / HALFWIDTH IDEOGRAPHIC FULL STOP /
231	case `0xFFFC`: / OBJECT REPLACEMENT CHARACTER /
232	case `0xFFFD`: / REPLACEMENT CHARACTER /
233	case `0x1F50F`: / LOCK WITH INK PEN /
234	case `0x1F510`: / CLOSED LOCK WITH KEY /
235	case `0x1F511`: / KEY /
236	case `0x1F512`: / LOCK /
237	case `0x1F513`: / OPEN LOCK /
238	return true;
239	case `0x0307`: / COMBINING DOT ABOVE /
240	return previousCodePoint == `0x0237` / LATIN SMALL LETTER DOTLESS J /
241	\|\| previousCodePoint == `0x0131` / LATIN SMALL LETTER DOTLESS I /
242	\|\| previousCodePoint == `0x05D5`; / HEBREW LETTER VAV /
243	case `0x0548`: / ARMENIAN CAPITAL LETTER VO /
244	case `0x054D`: / ARMENIAN CAPITAL LETTER SEH /
245	case `0x0578`: / ARMENIAN SMALL LETTER VO /
246	case `0x057D`: / ARMENIAN SMALL LETTER SEH /
247	return previousCodePoint
248	&& !isASCIIDigitOrValidHostCharacter(previousCodePoint.value())
249	&& !isArmenianScriptCharacter(previousCodePoint.value());
250	case `'.'`:
251	return false;
252	default:
253	return previousCodePoint
254	&& isArmenianLookalikeCharacter(previousCodePoint.value())
255	&& !(isArmenianScriptCharacter(charCode) \|\| isASCIIDigitOrValidHostCharacter(charCode));
256	}
257	}
258
259	void whiteListIDNScript(const char* scriptName)
260	{
261	int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName);
262	if (script >= `0` && script < USCRIPT_CODE_LIMIT) {
263	size_t index = script / `32`;
264	uint32_t mask = `1` << (script % `32`);
265	IDNScriptWhiteList[index] \|= mask;
266	}
267	}
268
269	void initializeDefaultIDNScriptWhiteList()
270	{
271	const char* defaultIDNScriptWhiteList[`20`] = {
272	"Common",
273	"Inherited",
274	"Arabic",
275	"Armenian",
276	"Bopomofo",
277	"Canadian_Aboriginal",
278	"Devanagari",
279	"Deseret",
280	"Gujarati",
281	"Gurmukhi",
282	"Hangul",
283	"Han",
284	"Hebrew",
285	"Hiragana",
286	"Katakana_Or_Hiragana",
287	"Katakana",
288	"Latin",
289	"Tamil",
290	"Thai",
291	"Yi",
292	};
293	for (const char* scriptName : defaultIDNScriptWhiteList)
294	whiteListIDNScript(scriptName);
295	}
296
297	static bool allCharactersInIDNScriptWhiteList(const UChar* buffer, int32_t length)
298	{
299	loadIDNScriptWhiteList();
300	int32_t i = `0`;
301	Optional<UChar32> previousCodePoint;
302	while (i < length) {
303	UChar32 c;
304	U16_NEXT(buffer, i, length, c)
305	UErrorCode error = U_ZERO_ERROR;
306	UScriptCode script = uscript_getScript(c, &error);
307	if (error != U_ZERO_ERROR) {
308	LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
309	return false;
310	}
311	if (script < `0`) {
312	LOG_ERROR("got negative number for script code from ICU: %d", script);
313	return false;
314	}
315	if (script >= USCRIPT_CODE_LIMIT)
316	return false;
317
318	size_t index = script / `32`;
319	uint32_t mask = `1` << (script % `32`);
320	if (!(IDNScriptWhiteList[index] & mask))
321	return false;
322
323	if (isLookalikeCharacter(previousCodePoint, c))
324	return false;
325	previousCodePoint = c;
326	}
327	return true;
328	}
329
330	template<typename Func>
331	static inline bool isSecondLevelDomainNameAllowedByTLDRules(const UChar* buffer, int32_t length, Func characterIsAllowed)
332	{
333	ASSERT(length > `0`);
334
335	for (int32_t i = length - `1`; i >= `0`; --i) {
336	UChar ch = buffer[i];
337
338	if (characterIsAllowed(ch))
339	continue;
340
341	// Only check the second level domain. Lower level registrars may have different rules.
342	if (ch == `'.'`)
343	break;
344
345	return false;
346	}
347	return true;
348	}
349
// Expands inside a function that has `buffer` and `length` in scope. If the host name ends with
// the UChar array `suffix` (and is longer than it), the enclosing function returns the result of
// checking the label in front of the suffix with `function`; otherwise execution falls through.
#define CHECK_RULES_IF_SUFFIX_MATCHES(suffix, function) \
    { \
        static const int32_t suffixUnitCount = sizeof(suffix) / sizeof(suffix[0]); \
        if (length > suffixUnitCount) { \
            if (memcmp(buffer + length - suffixUnitCount, suffix, sizeof(suffix)) == 0) \
                return isSecondLevelDomainNameAllowedByTLDRules(buffer, length - suffixUnitCount, function); \
        } \
    }
356
357	static bool isRussianDomainNameCharacter(UChar ch)
358	{
359	// Only modern Russian letters, digits and dashes are allowed.
360	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
361	}
362
363	static bool allCharactersAllowedByTLDRules(const UChar* buffer, int32_t length)
364	{
365	// Skip trailing dot for root domain.
366	if (buffer[length - `1`] == `'.'`)
367	length--;
368
369	// http://cctld.ru/files/pdf/docs/rules_ru-rf.pdf
370	static const UChar cyrillicRF[] = {
371	`'.'`,
372	`0x0440`, // CYRILLIC SMALL LETTER ER
373	`0x0444`, // CYRILLIC SMALL LETTER EF
374	};
375	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRF, isRussianDomainNameCharacter);
376
377	// http://rusnames.ru/rules.pl
378	static const UChar cyrillicRUS[] = {
379	`'.'`,
380	`0x0440`, // CYRILLIC SMALL LETTER ER
381	`0x0443`, // CYRILLIC SMALL LETTER U
382	`0x0441`, // CYRILLIC SMALL LETTER ES
383	};
384	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRUS, isRussianDomainNameCharacter);
385
386	// http://ru.faitid.org/projects/moscow/documents/moskva/idn
387	static const UChar cyrillicMOSKVA[] = {
388	`'.'`,
389	`0x043C`, // CYRILLIC SMALL LETTER EM
390	`0x043E`, // CYRILLIC SMALL LETTER O
391	`0x0441`, // CYRILLIC SMALL LETTER ES
392	`0x043A`, // CYRILLIC SMALL LETTER KA
393	`0x0432`, // CYRILLIC SMALL LETTER VE
394	`0x0430`, // CYRILLIC SMALL LETTER A
395	};
396	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMOSKVA, isRussianDomainNameCharacter);
397
398	// http://www.dotdeti.ru/foruser/docs/regrules.php
399	static const UChar cyrillicDETI[] = {
400	`'.'`,
401	`0x0434`, // CYRILLIC SMALL LETTER DE
402	`0x0435`, // CYRILLIC SMALL LETTER IE
403	`0x0442`, // CYRILLIC SMALL LETTER TE
404	`0x0438`, // CYRILLIC SMALL LETTER I
405	};
406	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicDETI, isRussianDomainNameCharacter);
407
408	// http://corenic.org - rules not published. The word is Russian, so only allowing Russian at this time,
409	// although we may need to revise the checks if this ends up being used with other languages spoken in Russia.
410	static const UChar cyrillicONLAYN[] = {
411	`'.'`,
412	`0x043E`, // CYRILLIC SMALL LETTER O
413	`0x043D`, // CYRILLIC SMALL LETTER EN
414	`0x043B`, // CYRILLIC SMALL LETTER EL
415	`0x0430`, // CYRILLIC SMALL LETTER A
416	`0x0439`, // CYRILLIC SMALL LETTER SHORT I
417	`0x043D`, // CYRILLIC SMALL LETTER EN
418	};
419	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicONLAYN, isRussianDomainNameCharacter);
420
421	// http://corenic.org - same as above.
422	static const UChar cyrillicSAYT[] = {
423	`'.'`,
424	`0x0441`, // CYRILLIC SMALL LETTER ES
425	`0x0430`, // CYRILLIC SMALL LETTER A
426	`0x0439`, // CYRILLIC SMALL LETTER SHORT I
427	`0x0442`, // CYRILLIC SMALL LETTER TE
428	};
429	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSAYT, isRussianDomainNameCharacter);
430
431	// http://pir.org/products/opr-domain/ - rules not published. According to the registry site,
432	// the intended audience is "Russian and other Slavic-speaking markets".
433	// Chrome appears to only allow Russian, so sticking with that for now.
434	static const UChar cyrillicORG[] = {
435	`'.'`,
436	`0x043E`, // CYRILLIC SMALL LETTER O
437	`0x0440`, // CYRILLIC SMALL LETTER ER
438	`0x0433`, // CYRILLIC SMALL LETTER GHE
439	};
440	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicORG, isRussianDomainNameCharacter);
441
442	// http://cctld.by/rules.html
443	static const UChar cyrillicBEL[] = {
444	`'.'`,
445	`0x0431`, // CYRILLIC SMALL LETTER BE
446	`0x0435`, // CYRILLIC SMALL LETTER IE
447	`0x043B`, // CYRILLIC SMALL LETTER EL
448	};
449	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBEL, [](UChar ch) {
450	// Russian and Byelorussian letters, digits and dashes are allowed.
451	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x0456` \|\| ch == `0x045E` \|\| ch == `0x2019` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
452	});
453
454	// http://www.nic.kz/docs/poryadok_vnedreniya_kaz_ru.pdf
455	static const UChar cyrillicKAZ[] = {
456	`'.'`,
457	`0x049B`, // CYRILLIC SMALL LETTER KA WITH DESCENDER
458	`0x0430`, // CYRILLIC SMALL LETTER A
459	`0x0437`, // CYRILLIC SMALL LETTER ZE
460	};
461	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicKAZ, [](UChar ch) {
462	// Kazakh letters, digits and dashes are allowed.
463	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x04D9` \|\| ch == `0x0493` \|\| ch == `0x049B` \|\| ch == `0x04A3` \|\| ch == `0x04E9` \|\| ch == `0x04B1` \|\| ch == `0x04AF` \|\| ch == `0x04BB` \|\| ch == `0x0456` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
464	});
465
466	// http://uanic.net/docs/documents-ukr/Rules%20of%20UKR_v4.0.pdf
467	static const UChar cyrillicUKR[] = {
468	`'.'`,
469	`0x0443`, // CYRILLIC SMALL LETTER U
470	`0x043A`, // CYRILLIC SMALL LETTER KA
471	`0x0440`, // CYRILLIC SMALL LETTER ER
472	};
473	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicUKR, [](UChar ch) {
474	// Russian and Ukrainian letters, digits and dashes are allowed.
475	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x0491` \|\| ch == `0x0404` \|\| ch == `0x0456` \|\| ch == `0x0457` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
476	});
477
478	// http://www.rnids.rs/data/DOKUMENTI/idn-srb-policy-termsofuse-v1.4-eng.pdf
479	static const UChar cyrillicSRB[] = {
480	`'.'`,
481	`0x0441`, // CYRILLIC SMALL LETTER ES
482	`0x0440`, // CYRILLIC SMALL LETTER ER
483	`0x0431`, // CYRILLIC SMALL LETTER BE
484	};
485	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSRB, [](UChar ch) {
486	// Serbian letters, digits and dashes are allowed.
487	return (ch >= `0x0430` && ch <= `0x0438`) \|\| (ch >= `0x043A` && ch <= `0x0448`) \|\| ch == `0x0452` \|\| ch == `0x0458` \|\| ch == `0x0459` \|\| ch == `0x045A` \|\| ch == `0x045B` \|\| ch == `0x045F` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
488	});
489
490	// http://marnet.mk/doc/pravilnik-mk-mkd.pdf
491	static const UChar cyrillicMKD[] = {
492	`'.'`,
493	`0x043C`, // CYRILLIC SMALL LETTER EM
494	`0x043A`, // CYRILLIC SMALL LETTER KA
495	`0x0434`, // CYRILLIC SMALL LETTER DE
496	};
497	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMKD, [](UChar ch) {
498	// Macedonian letters, digits and dashes are allowed.
499	return (ch >= `0x0430` && ch <= `0x0438`) \|\| (ch >= `0x043A` && ch <= `0x0448`) \|\| ch == `0x0453` \|\| ch == `0x0455` \|\| ch == `0x0458` \|\| ch == `0x0459` \|\| ch == `0x045A` \|\| ch == `0x045C` \|\| ch == `0x045F` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
500	});
501
502	// https://www.mon.mn/cs/
503	static const UChar cyrillicMON[] = {
504	`'.'`,
505	`0x043C`, // CYRILLIC SMALL LETTER EM
506	`0x043E`, // CYRILLIC SMALL LETTER O
507	`0x043D`, // CYRILLIC SMALL LETTER EN
508	};
509	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMON, [](UChar ch) {
510	// Mongolian letters, digits and dashes are allowed.
511	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x04E9` \|\| ch == `0x04AF` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
512	});
513
514	// https://www.icann.org/sites/default/files/packages/lgr/lgr-second-level-bulgarian-30aug16-en.html
515	static const UChar cyrillicBG[] = {
516	`'.'`,
517	`0x0431`, // CYRILLIC SMALL LETTER BE
518	`0x0433` // CYRILLIC SMALL LETTER GHE
519	};
520	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBG, [](UChar ch) {
521	return (ch >= `0x0430` && ch <= `0x044A`) \|\| ch == `0x044C` \|\| (ch >= `0x044E` && ch <= `0x0450`) \|\| ch == `0x045D` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
522	});
523
524	// Not a known top level domain with special rules.
525	return false;
526	}
527
528	// Return value of null means no mapping is necessary.
529	Optional<String> mapHostName(const String& hostName, const Optional<URLDecodeFunction>& decodeFunction)
530	{
531	if (hostName.length() > hostNameBufferLength)
532	return String ();
533
534	if (!hostName.length())
535	return String ();
536
537	String string;
538	if (decodeFunction && string.contains(`'%'`))
539	string = (*decodeFunction)(hostName);
540	else
541	string = hostName;
542
543	unsigned length = string.length();
544
545	auto sourceBuffer = string.charactersWithNullTermination();
546
547	UChar destinationBuffer[hostNameBufferLength];
548	UErrorCode uerror = U_ZERO_ERROR;
549	UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
550	int32_t numCharactersConverted = (decodeFunction ? uidna_nameToASCII : uidna_nameToUnicode)(&URLParser::internationalDomainNameTranscoder(), sourceBuffer.data(), length, destinationBuffer, hostNameBufferLength, &processingDetails, &uerror);
551	if (length && (U_FAILURE(uerror) \|\| processingDetails.errors))
552	return nullopt;
553
554	if (numCharactersConverted == static_cast<int32_t>(length) && !memcmp(sourceBuffer.data(), destinationBuffer, length * sizeof(UChar)))
555	return String ();
556
557	if (!decodeFunction && !allCharactersInIDNScriptWhiteList(destinationBuffer, numCharactersConverted) && !allCharactersAllowedByTLDRules(destinationBuffer, numCharactersConverted))
558	return String ();
559
560	return String (destinationBuffer, numCharactersConverted);
561	}
562
563	using MappingRangesVector = Optional<Vector<std::tuple<unsigned, unsigned, String>>>;
564
565	static void collectRangesThatNeedMapping(const String& string, unsigned location, unsigned length, MappingRangesVector& array, const Optional<URLDecodeFunction>& decodeFunction)
566	{
567	// Generally, we want to optimize for the case where there is one host name that does not need mapping.
568	// Therefore, we use null to indicate no mapping here and an empty array to indicate error.
569
570	String substring = string.substringSharingImpl(location, length);
571	Optional<String> host = mapHostName(substring, decodeFunction);
572
573	if (host && !*host)
574	return;
575
576	if (!array)
577	array = Vector<std::tuple<unsigned, unsigned, String>>();
578
579	if (host)
580	array ->constructAndAppend(location, length, *host);
581	}
582
583	static void applyHostNameFunctionToMailToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array)
584	{
585	// In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' character.
586	// Skip quoted strings so that characters in them don't confuse us.
587	// When we find a '?' character, we are past the part of the URL that contains host names.
588
589	unsigned stringLength = string.length();
590	unsigned current = `0`;
591
592	while (`1`) {
593	// Find start of host name or of quoted string.
594	auto hostNameOrStringStart = string.find([](UChar ch) {
595	return ch == `'"'` \|\| ch == `'@'` \|\| ch == `'?'`;
596	}, current);
597	if (hostNameOrStringStart == notFound)
598	return;
599
600	UChar c = string [hostNameOrStringStart];
601	current = hostNameOrStringStart + `1`;
602
603	if (c == `'?'`)
604	return;
605
606	if (c == `'@'`) {
607	// Find end of host name.
608	unsigned hostNameStart = current;
609	auto hostNameEnd = string.find([](UChar ch) {
610	return ch == `'>'` \|\| ch == `','` \|\| ch == `'?'`;
611	}, current);
612
613	bool done;
614	if (hostNameEnd == notFound) {
615	hostNameEnd = stringLength;
616	done = true;
617	} else {
618	current = hostNameEnd;
619	done = false;
620	}
621
622	// Process host name range.
623	collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction);
624
625	if (done)
626	return;
627	} else {
628	// Skip quoted string.
629	ASSERT(c == `'"'`);
630	while (`1`) {
631	auto escapedCharacterOrStringEnd = string.find([](UChar ch) {
632	return ch == `'"'` \|\| ch == `'\\'`;
633	}, current);
634	if (escapedCharacterOrStringEnd == notFound)
635	return;
636
637	c = string [escapedCharacterOrStringEnd];
638	current = escapedCharacterOrStringEnd + `1`;
639
640	// If we are the end of the string, then break from the string loop back to the host name loop.
641	if (c == `'"'`)
642	break;
643
644	// Skip escaped character.
645	ASSERT(c == `'\\'`);
646	if (current == stringLength)
647	return;
648
649	++current;
650	}
651	}
652	}
653	}
654
655	static void applyHostNameFunctionToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array)
656	{
657	// Find hostnames. Too bad we can't use any real URL-parsing code to do this,
658	// but we have to do it before doing all the %-escaping, and this is the only
659	// code we have that parses mailto URLs anyway.
660
661	// Maybe we should implement this using a character buffer instead?
662
663	if (protocolIs(string, "mailto")) {
664	applyHostNameFunctionToMailToURLString(string, decodeFunction, array);
665	return;
666	}
667
668	// Find the host name in a hierarchical URL.
669	// It comes after a "://" sequence, with scheme characters preceding.
670	// If ends with the end of the string or a ":", "/", or a "?".
671	// If there is a "@" character, the host part is just the part after the "@".
672	static const char* separator = "://";
673	auto separatorIndex = string.find(separator);
674	if (separatorIndex == notFound)
675	return;
676
677	unsigned authorityStart = separatorIndex + strlen(separator);
678
679	// Check that all characters before the :// are valid scheme characters.
680	if (StringView { string }.left(separatorIndex).contains([](UChar character) {
681	return !(isASCIIAlphanumeric(character) \|\| character == `'+'` \|\| character == `'-'` \|\| character == `'.'`);
682	}))
683	return;
684
685	// Find terminating character.
686	auto hostNameTerminator = string.find([](UChar character) {
687	return character == `':'` \|\| character == `'/'` \|\| character == `'?'` \|\| character == `'#'`;
688	}, authorityStart);
689	unsigned hostNameEnd = hostNameTerminator == notFound ? string.length() : hostNameTerminator;
690
691	// Find "@" for the start of the host name.
692	auto userInfoTerminator = StringView { string }.left(hostNameEnd).find(`'@'`, authorityStart);
693	unsigned hostNameStart = userInfoTerminator == notFound ? authorityStart : userInfoTerminator + `1`;
694
695	collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction);
696	}
697
698	String mapHostNames(const String& string, const Optional<URLDecodeFunction>& decodeFunction)
699	{
700	// Generally, we want to optimize for the case where there is one host name that does not need mapping.
701
702	if (decodeFunction && string.isAllASCII())
703	return string;
704
705	// Make a list of ranges that actually need mapping.
706	MappingRangesVector hostNameRanges;
707	applyHostNameFunctionToURLString(string, decodeFunction, hostNameRanges);
708	if (!hostNameRanges)
709	return string;
710
711	if (hostNameRanges ->isEmpty())
712	return { };
713
714	// Do the mapping.
715	String result = string;
716	while (!hostNameRanges ->isEmpty()) {
717	auto [location, length, mappedHostName] = hostNameRanges ->takeLast();
718	result = result.replace(location, length, mappedHostName);
719	}
720	return result;
721	}
722
723	static String escapeUnsafeCharacters(const String& sourceBuffer)
724	{
725	unsigned length = sourceBuffer.length();
726
727	Optional<UChar32> previousCodePoint;
728
729	unsigned i;
730	for (i = `0`; i < length; ) {
731	UChar32 c = sourceBuffer.characterStartingAt(i);
732	if (isLookalikeCharacter(previousCodePoint, sourceBuffer.characterStartingAt(i)))
733	break;
734	previousCodePoint = c;
735	i += U16_LENGTH(c);
736	}
737
738	if (i == length)
739	return sourceBuffer;
740
741	Vector<UChar, urlBytesBufferLength> outBuffer;
742
743	outBuffer.grow(i);
744	if (sourceBuffer.is8Bit())
745	StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters8(), i);
746	else
747	StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters16(), i);
748
749	for (; i < length; ) {
750	UChar32 c = sourceBuffer.characterStartingAt(i);
751	unsigned characterLength = U16_LENGTH(c);
752	if (isLookalikeCharacter(previousCodePoint, c)) {
753	uint8_t utf8Buffer[`4`];
754	size_t offset = `0`;
755	UBool failure = false;
756	U8_APPEND(utf8Buffer, offset, `4`, c, failure);
757	ASSERT_UNUSED(failure, !failure);
758
759	for (size_t j = `0`; j < offset; ++j) {
760	outBuffer.append(`'%'`);
761	outBuffer.append(upperNibbleToASCIIHexDigit(utf8Buffer[j]));
762	outBuffer.append(lowerNibbleToASCIIHexDigit(utf8Buffer[j]));
763	}
764	} else {
765	for (unsigned j = `0`; j < characterLength; ++j)
766	outBuffer.append(sourceBuffer [i + j]);
767	}
768	previousCodePoint = c;
769	i += characterLength;
770	}
771
772	return String::adopt(WTFMove(outBuffer));
773	}
774
775	String userVisibleURL(const CString& url)
776	{
777	auto* before = reinterpret_cast<const unsigned char*>(url.data());
778	int length = url.length();
779
780	if (!length)
781	return { };
782
783	bool mayNeedHostNameDecoding = false;
784
785	Checked<int, RecordOverflow> bufferLength = length;
786	bufferLength = bufferLength * `3` + `1`; // The buffer should be large enough to %-escape every character.
787	if (bufferLength.hasOverflowed())
788	return { };
789	Vector<char, urlBytesBufferLength> after(bufferLength.unsafeGet());
790
791	char* q = after.data();
792	{
793	const unsigned char* p = before;
794	for (int i = `0`; i < length; i++) {
795	unsigned char c = p[i];
796	// unescape escape sequences that indicate bytes greater than 0x7f
797	if (c == `'%'` && i + `2` < length && isASCIIHexDigit(p[i + `1`]) && isASCIIHexDigit(p[i + `2`])) {
798	auto u = toASCIIHexValue(p[i + `1`], p[i + `2`]);
799	if (u > `0x7f`) {
800	// unescape
801	*q++ = u;
802	} else {
803	// do not unescape
804	*q++ = p[i];
805	*q++ = p[i + `1`];
806	*q++ = p[i + `2`];
807	}
808	i += `2`;
809	} else {
810	*q++ = c;
811
812	// Check for "xn--" in an efficient, non-case-sensitive, way.
813	if (c == `'-'` && i >= `3` && !mayNeedHostNameDecoding && (q[-`4`] \| `0x20`) == `'x'` && (q[-`3`] \| `0x20`) == `'n'` && q[-`2`] == `'-'`)
814	mayNeedHostNameDecoding = true;
815	}
816	}
817	*q = `'\0'`;
818	}
819
820	// Check string to see if it can be converted to display using UTF-8
821	String result = String::fromUTF8(after.data());
822	if (!result) {
823	// Could not convert to UTF-8.
824	// Convert characters greater than 0x7f to escape sequences.
825	// Shift current string to the end of the buffer
826	// then we will copy back bytes to the start of the buffer
827	// as we convert.
828	int afterlength = q - after.data();
829	char* p = after.data() + bufferLength.unsafeGet() - afterlength - `1`;
830	memmove(p, after.data(), afterlength + `1`); // copies trailing '\0'
831	char* q = after.data();
832	while (*p) {
833	unsigned char c = *p;
834	if (c > `0x7f`) {
835	*q++ = `'%'`;
836	*q++ = upperNibbleToASCIIHexDigit(c);
837	*q++ = lowerNibbleToASCIIHexDigit(c);
838	} else
839	q++ = p;
840	p++;
841	}
842	*q = `'\0'`;
843	// Note: after.data() points to a null-terminated, pure ASCII string.
844	result = String::fromUTF8(after.data());
845	ASSERT(!!result);
846	}
847
848	// Note: result is UTF–16 string, created from either a valid UTF-8 string,
849	// or a pure ASCII string (where all bytes with the high bit set are
850	// percent-encoded).
851
852	if (mayNeedHostNameDecoding) {
853	// FIXME: Is it good to ignore the failure of mapHostNames and keep result intact?
854	auto mappedResult = mapHostNames(result, nullopt);
855	if (!!mappedResult)
856	result = mappedResult;
857	}
858
859	return escapeUnsafeCharacters(normalizedNFC(result));
860	}
861
862	} // namespace URLHelpers
863	} // namespace WTF
864

Browse the source code of jsc/Source/WTF/wtf/URLHelpers.cpp