URLHelpers.cpp source code [webcore/Source/WTF/wtf/URLHelpers.cpp]

1	/*
2	* Copyright (C) 2005-2019 Apple Inc. All rights reserved.
3	* Copyright (C) 2018 Igalia S.L.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* 1. Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* 2. Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* 3. Neither the name of Apple Inc. ("Apple") nor the names of
15	* its contributors may be used to endorse or promote products derived
16	* from this software without specific prior written permission.
17	*
18	* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
19	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21	* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
22	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28	*/
29
30	#include "config.h"
31	#include "URLHelpers.h"
32
33	#include "URLParser.h"
34	#include <mutex>
35	#include <unicode/uidna.h>
36	#include <unicode/uscript.h>
37	#include <wtf/Optional.h>
38	#include <wtf/text/WTFString.h>
39
40	namespace WTF {
41	namespace URLHelpers {
42
43	// Needs to be big enough to hold an IDN-encoded name.
44	// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
45	const unsigned hostNameBufferLength = `2048`;
46	const unsigned urlBytesBufferLength = `2048`;
47
48	static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + `31`) / `32`];
49
50	#if !PLATFORM(COCOA)
51
52	// Cocoa has an implementation that uses a whitelist in /Library or ~/Library,
53	// if it exists.
54	void loadIDNScriptWhiteList()
55	{
56	static std::once_flag flag;
57	std::call_once(flag, initializeDefaultIDNScriptWhiteList);
58	}
59
60	#endif // !PLATFORM(COCOA)
61
62	static bool isArmenianLookalikeCharacter(UChar32 codePoint)
63	{
64	return codePoint == `0x0548` \|\| codePoint == `0x054D` \|\| codePoint == `0x0578` \|\| codePoint == `0x057D`;
65	}
66
67	static bool isArmenianScriptCharacter(UChar32 codePoint)
68	{
69	UErrorCode error = U_ZERO_ERROR;
70	UScriptCode script = uscript_getScript(codePoint, &error);
71	if (error != U_ZERO_ERROR) {
72	LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
73	return false;
74	}
75
76	return script == USCRIPT_ARMENIAN;
77	}
78
// Returns true for an ASCII digit or punctuation character, except for the
// punctuation characters that the URL parser rejects.
template<typename CharacterType> inline bool isASCIIDigitOrValidHostCharacter(CharacterType charCode)
{
    if (!isASCIIDigitOrPunctuation(charCode))
        return false;

    // Things the URL Parser rejects:
    const bool rejectedByURLParser = charCode == '#'
        || charCode == '%'
        || charCode == '/'
        || charCode == ':'
        || charCode == '?'
        || charCode == '@'
        || charCode == '['
        || charCode == '\\'
        || charCode == ']';
    return !rejectedByURLParser;
}
100
101	static bool isLookalikeCharacter(const Optional<UChar32>& previousCodePoint, UChar32 charCode)
102	{
103	// This function treats the following as unsafe, lookalike characters:
104	// any non-printable character, any character considered as whitespace,
105	// any ignorable character, and emoji characters related to locks.
106
107	// We also considered the characters in Mozilla's blacklist <http://kb.mozillazine.org/Network.IDN.blacklist_chars>.
108
109	// Some of the characters here will never appear once ICU has encoded.
110	// For example, ICU transforms most spaces into an ASCII space and most
111	// slashes into an ASCII solidus. But one of the two callers uses this
112	// on characters that have not been processed by ICU, so they are needed here.
113
114	if (!u_isprint(charCode) \|\| u_isUWhiteSpace(charCode) \|\| u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
115	return true;
116
117	switch (charCode) {
118	case `0x00BC`: / VULGAR FRACTION ONE QUARTER /
119	case `0x00BD`: / VULGAR FRACTION ONE HALF /
120	case `0x00BE`: / VULGAR FRACTION THREE QUARTERS /
121	case `0x00ED`: / LATIN SMALL LETTER I WITH ACUTE /
122	/ 0x0131 LATIN SMALL LETTER DOTLESS I is intentionally not considered a lookalike character because it is visually distinguishable from i and it has legitimate use in the Turkish language. /
123	case `0x01C0`: / LATIN LETTER DENTAL CLICK /
124	case `0x01C3`: / LATIN LETTER RETROFLEX CLICK /
125	case `0x0251`: / LATIN SMALL LETTER ALPHA /
126	case `0x0261`: / LATIN SMALL LETTER SCRIPT G /
127	case `0x027E`: / LATIN SMALL LETTER R WITH FISHHOOK /
128	case `0x02D0`: / MODIFIER LETTER TRIANGULAR COLON /
129	case `0x0335`: / COMBINING SHORT STROKE OVERLAY /
130	case `0x0337`: / COMBINING SHORT SOLIDUS OVERLAY /
131	case `0x0338`: / COMBINING LONG SOLIDUS OVERLAY /
132	case `0x0589`: / ARMENIAN FULL STOP /
133	case `0x05B4`: / HEBREW POINT HIRIQ /
134	case `0x05BC`: / HEBREW POINT DAGESH OR MAPIQ /
135	case `0x05C3`: / HEBREW PUNCTUATION SOF PASUQ /
136	case `0x05F4`: / HEBREW PUNCTUATION GERSHAYIM /
137	case `0x0609`: / ARABIC-INDIC PER MILLE SIGN /
138	case `0x060A`: / ARABIC-INDIC PER TEN THOUSAND SIGN /
139	case `0x0650`: / ARABIC KASRA /
140	case `0x0660`: / ARABIC INDIC DIGIT ZERO /
141	case `0x066A`: / ARABIC PERCENT SIGN /
142	case `0x06D4`: / ARABIC FULL STOP /
143	case `0x06F0`: / EXTENDED ARABIC INDIC DIGIT ZERO /
144	case `0x0701`: / SYRIAC SUPRALINEAR FULL STOP /
145	case `0x0702`: / SYRIAC SUBLINEAR FULL STOP /
146	case `0x0703`: / SYRIAC SUPRALINEAR COLON /
147	case `0x0704`: / SYRIAC SUBLINEAR COLON /
148	case `0x1735`: / PHILIPPINE SINGLE PUNCTUATION /
149	case `0x1D04`: / LATIN LETTER SMALL CAPITAL C /
150	case `0x1D0F`: / LATIN LETTER SMALL CAPITAL O /
151	case `0x1D1C`: / LATIN LETTER SMALL CAPITAL U /
152	case `0x1D20`: / LATIN LETTER SMALL CAPITAL V /
153	case `0x1D21`: / LATIN LETTER SMALL CAPITAL W /
154	case `0x1D22`: / LATIN LETTER SMALL CAPITAL Z /
155	case `0x1ECD`: / LATIN SMALL LETTER O WITH DOT BELOW /
156	case `0x2010`: / HYPHEN /
157	case `0x2011`: / NON-BREAKING HYPHEN /
158	case `0x2024`: / ONE DOT LEADER /
159	case `0x2027`: / HYPHENATION POINT /
160	case `0x2039`: / SINGLE LEFT-POINTING ANGLE QUOTATION MARK /
161	case `0x203A`: / SINGLE RIGHT-POINTING ANGLE QUOTATION MARK /
162	case `0x2041`: / CARET INSERTION POINT /
163	case `0x2044`: / FRACTION SLASH /
164	case `0x2052`: / COMMERCIAL MINUS SIGN /
165	case `0x2153`: / VULGAR FRACTION ONE THIRD /
166	case `0x2154`: / VULGAR FRACTION TWO THIRDS /
167	case `0x2155`: / VULGAR FRACTION ONE FIFTH /
168	case `0x2156`: / VULGAR FRACTION TWO FIFTHS /
169	case `0x2157`: / VULGAR FRACTION THREE FIFTHS /
170	case `0x2158`: / VULGAR FRACTION FOUR FIFTHS /
171	case `0x2159`: / VULGAR FRACTION ONE SIXTH /
172	case `0x215A`: / VULGAR FRACTION FIVE SIXTHS /
173	case `0x215B`: / VULGAR FRACTION ONE EIGHT /
174	case `0x215C`: / VULGAR FRACTION THREE EIGHTHS /
175	case `0x215D`: / VULGAR FRACTION FIVE EIGHTHS /
176	case `0x215E`: / VULGAR FRACTION SEVEN EIGHTHS /
177	case `0x215F`: / FRACTION NUMERATOR ONE /
178	case `0x2212`: / MINUS SIGN /
179	case `0x2215`: / DIVISION SLASH /
180	case `0x2216`: / SET MINUS /
181	case `0x2236`: / RATIO /
182	case `0x233F`: / APL FUNCTIONAL SYMBOL SLASH BAR /
183	case `0x23AE`: / INTEGRAL EXTENSION /
184	case `0x244A`: / OCR DOUBLE BACKSLASH /
185	case `0x2571`: / BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT /
186	case `0x2572`: / BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT /
187	case `0x29F6`: / SOLIDUS WITH OVERBAR /
188	case `0x29F8`: / BIG SOLIDUS /
189	case `0x2AFB`: / TRIPLE SOLIDUS BINARY RELATION /
190	case `0x2AFD`: / DOUBLE SOLIDUS OPERATOR /
191	case `0x2FF0`: / IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT /
192	case `0x2FF1`: / IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW /
193	case `0x2FF2`: / IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT /
194	case `0x2FF3`: / IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW /
195	case `0x2FF4`: / IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND /
196	case `0x2FF5`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE /
197	case `0x2FF6`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM BELOW /
198	case `0x2FF7`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LEFT /
199	case `0x2FF8`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER LEFT /
200	case `0x2FF9`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT /
201	case `0x2FFA`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT /
202	case `0x2FFB`: / IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID /
203	case `0x3002`: / IDEOGRAPHIC FULL STOP /
204	case `0x3008`: / LEFT ANGLE BRACKET /
205	case `0x3014`: / LEFT TORTOISE SHELL BRACKET /
206	case `0x3015`: / RIGHT TORTOISE SHELL BRACKET /
207	case `0x3033`: / VERTICAL KANA REPEAT MARK UPPER HALF /
208	case `0x3035`: / VERTICAL KANA REPEAT MARK LOWER HALF /
209	case `0x321D`: / PARENTHESIZED KOREAN CHARACTER OJEON /
210	case `0x321E`: / PARENTHESIZED KOREAN CHARACTER O HU /
211	case `0x33AE`: / SQUARE RAD OVER S /
212	case `0x33AF`: / SQUARE RAD OVER S SQUARED /
213	case `0x33C6`: / SQUARE C OVER KG /
214	case `0x33DF`: / SQUARE A OVER M /
215	case `0x05B9`: / HEBREW POINT HOLAM /
216	case `0x05BA`: / HEBREW POINT HOLAM HASER FOR VAV /
217	case `0x05C1`: / HEBREW POINT SHIN DOT /
218	case `0x05C2`: / HEBREW POINT SIN DOT /
219	case `0x05C4`: / HEBREW MARK UPPER DOT /
220	case `0xA731`: / LATIN LETTER SMALL CAPITAL S /
221	case `0xA771`: / LATIN SMALL LETTER DUM /
222	case `0xA789`: / MODIFIER LETTER COLON /
223	case `0xFE14`: / PRESENTATION FORM FOR VERTICAL SEMICOLON /
224	case `0xFE15`: / PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK /
225	case `0xFE3F`: / PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET /
226	case `0xFE5D`: / SMALL LEFT TORTOISE SHELL BRACKET /
227	case `0xFE5E`: / SMALL RIGHT TORTOISE SHELL BRACKET /
228	case `0xFF0E`: / FULLWIDTH FULL STOP /
229	case `0xFF0F`: / FULL WIDTH SOLIDUS /
230	case `0xFF61`: / HALFWIDTH IDEOGRAPHIC FULL STOP /
231	case `0xFFFC`: / OBJECT REPLACEMENT CHARACTER /
232	case `0xFFFD`: / REPLACEMENT CHARACTER /
233	case `0x1F50F`: / LOCK WITH INK PEN /
234	case `0x1F510`: / CLOSED LOCK WITH KEY /
235	case `0x1F511`: / KEY /
236	case `0x1F512`: / LOCK /
237	case `0x1F513`: / OPEN LOCK /
238	return true;
239	case `0x0307`: / COMBINING DOT ABOVE /
240	return previousCodePoint == `0x0237` / LATIN SMALL LETTER DOTLESS J /
241	\|\| previousCodePoint == `0x0131` / LATIN SMALL LETTER DOTLESS I /
242	\|\| previousCodePoint == `0x05D5`; / HEBREW LETTER VAV /
243	case `0x0548`: / ARMENIAN CAPITAL LETTER VO /
244	case `0x054D`: / ARMENIAN CAPITAL LETTER SEH /
245	case `0x0578`: / ARMENIAN SMALL LETTER VO /
246	case `0x057D`: / ARMENIAN SMALL LETTER SEH /
247	return previousCodePoint
248	&& !isASCIIDigitOrValidHostCharacter(previousCodePoint.value())
249	&& !isArmenianScriptCharacter(previousCodePoint.value());
250	case `'.'`:
251	return false;
252	default:
253	return previousCodePoint
254	&& isArmenianLookalikeCharacter(previousCodePoint.value())
255	&& !(isArmenianScriptCharacter(charCode) \|\| isASCIIDigitOrValidHostCharacter(charCode));
256	}
257	}
258
259	void whiteListIDNScript(const char* scriptName)
260	{
261	int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName);
262	if (script >= `0` && script < USCRIPT_CODE_LIMIT) {
263	size_t index = script / `32`;
264	uint32_t mask = `1` << (script % `32`);
265	IDNScriptWhiteList[index] \|= mask;
266	}
267	}
268
269	void initializeDefaultIDNScriptWhiteList()
270	{
271	const char* defaultIDNScriptWhiteList[`20`] = {
272	"Common",
273	"Inherited",
274	"Arabic",
275	"Armenian",
276	"Bopomofo",
277	"Canadian_Aboriginal",
278	"Devanagari",
279	"Deseret",
280	"Gujarati",
281	"Gurmukhi",
282	"Hangul",
283	"Han",
284	"Hebrew",
285	"Hiragana",
286	"Katakana_Or_Hiragana",
287	"Katakana",
288	"Latin",
289	"Tamil",
290	"Thai",
291	"Yi",
292	};
293	for (const char* scriptName : defaultIDNScriptWhiteList)
294	whiteListIDNScript(scriptName);
295	}
296
297	static bool allCharactersInIDNScriptWhiteList(const UChar* buffer, int32_t length)
298	{
299	loadIDNScriptWhiteList();
300	int32_t i = `0`;
301	Optional<UChar32> previousCodePoint;
302	while (i < length) {
303	UChar32 c;
304	U16_NEXT(buffer, i, length, c)
305	UErrorCode error = U_ZERO_ERROR;
306	UScriptCode script = uscript_getScript(c, &error);
307	if (error != U_ZERO_ERROR) {
308	LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
309	return false;
310	}
311	if (script < `0`) {
312	LOG_ERROR("got negative number for script code from ICU: %d", script);
313	return false;
314	}
315	if (script >= USCRIPT_CODE_LIMIT)
316	return false;
317
318	size_t index = script / `32`;
319	uint32_t mask = `1` << (script % `32`);
320	if (!(IDNScriptWhiteList[index] & mask))
321	return false;
322
323	if (isLookalikeCharacter(previousCodePoint, c))
324	return false;
325	previousCodePoint = c;
326	}
327	return true;
328	}
329
330	static bool isSecondLevelDomainNameAllowedByTLDRules(const UChar* buffer, int32_t length, const WTF::Function<bool(UChar)>& characterIsAllowed)
331	{
332	ASSERT(length > `0`);
333
334	for (int32_t i = length - `1`; i >= `0`; --i) {
335	UChar ch = buffer[i];
336
337	if (characterIsAllowed (ch))
338	continue;
339
340	// Only check the second level domain. Lower level registrars may have different rules.
341	if (ch == `'.'`)
342	break;
343
344	return false;
345	}
346	return true;
347	}
348
// Must be expanded where `buffer` (const UChar*) and `length` (int32_t) are in scope.
// If the host name ends with the UChar array `suffix`, returns from the enclosing function
// with the result of checking the label before the suffix against `function`.
#define CHECK_RULES_IF_SUFFIX_MATCHES(suffix, function) \
    do { \
        static const int32_t suffixLength = sizeof(suffix) / sizeof(suffix[0]); \
        const bool hostNameEndsWithSuffix = length > suffixLength \
            && !memcmp(buffer + length - suffixLength, suffix, sizeof(suffix)); \
        if (hostNameEndsWithSuffix) \
            return isSecondLevelDomainNameAllowedByTLDRules(buffer, length - suffixLength, function); \
    } while (0)
355
356	static bool isRussianDomainNameCharacter(UChar ch)
357	{
358	// Only modern Russian letters, digits and dashes are allowed.
359	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
360	}
361
362	static bool allCharactersAllowedByTLDRules(const UChar* buffer, int32_t length)
363	{
364	// Skip trailing dot for root domain.
365	if (buffer[length - `1`] == `'.'`)
366	length--;
367
368	// http://cctld.ru/files/pdf/docs/rules_ru-rf.pdf
369	static const UChar cyrillicRF[] = {
370	`'.'`,
371	`0x0440`, // CYRILLIC SMALL LETTER ER
372	`0x0444`, // CYRILLIC SMALL LETTER EF
373	};
374	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRF, isRussianDomainNameCharacter);
375
376	// http://rusnames.ru/rules.pl
377	static const UChar cyrillicRUS[] = {
378	`'.'`,
379	`0x0440`, // CYRILLIC SMALL LETTER ER
380	`0x0443`, // CYRILLIC SMALL LETTER U
381	`0x0441`, // CYRILLIC SMALL LETTER ES
382	};
383	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRUS, isRussianDomainNameCharacter);
384
385	// http://ru.faitid.org/projects/moscow/documents/moskva/idn
386	static const UChar cyrillicMOSKVA[] = {
387	`'.'`,
388	`0x043C`, // CYRILLIC SMALL LETTER EM
389	`0x043E`, // CYRILLIC SMALL LETTER O
390	`0x0441`, // CYRILLIC SMALL LETTER ES
391	`0x043A`, // CYRILLIC SMALL LETTER KA
392	`0x0432`, // CYRILLIC SMALL LETTER VE
393	`0x0430`, // CYRILLIC SMALL LETTER A
394	};
395	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMOSKVA, isRussianDomainNameCharacter);
396
397	// http://www.dotdeti.ru/foruser/docs/regrules.php
398	static const UChar cyrillicDETI[] = {
399	`'.'`,
400	`0x0434`, // CYRILLIC SMALL LETTER DE
401	`0x0435`, // CYRILLIC SMALL LETTER IE
402	`0x0442`, // CYRILLIC SMALL LETTER TE
403	`0x0438`, // CYRILLIC SMALL LETTER I
404	};
405	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicDETI, isRussianDomainNameCharacter);
406
407	// http://corenic.org - rules not published. The word is Russian, so only allowing Russian at this time,
408	// although we may need to revise the checks if this ends up being used with other languages spoken in Russia.
409	static const UChar cyrillicONLAYN[] = {
410	`'.'`,
411	`0x043E`, // CYRILLIC SMALL LETTER O
412	`0x043D`, // CYRILLIC SMALL LETTER EN
413	`0x043B`, // CYRILLIC SMALL LETTER EL
414	`0x0430`, // CYRILLIC SMALL LETTER A
415	`0x0439`, // CYRILLIC SMALL LETTER SHORT I
416	`0x043D`, // CYRILLIC SMALL LETTER EN
417	};
418	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicONLAYN, isRussianDomainNameCharacter);
419
420	// http://corenic.org - same as above.
421	static const UChar cyrillicSAYT[] = {
422	`'.'`,
423	`0x0441`, // CYRILLIC SMALL LETTER ES
424	`0x0430`, // CYRILLIC SMALL LETTER A
425	`0x0439`, // CYRILLIC SMALL LETTER SHORT I
426	`0x0442`, // CYRILLIC SMALL LETTER TE
427	};
428	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSAYT, isRussianDomainNameCharacter);
429
430	// http://pir.org/products/opr-domain/ - rules not published. According to the registry site,
431	// the intended audience is "Russian and other Slavic-speaking markets".
432	// Chrome appears to only allow Russian, so sticking with that for now.
433	static const UChar cyrillicORG[] = {
434	`'.'`,
435	`0x043E`, // CYRILLIC SMALL LETTER O
436	`0x0440`, // CYRILLIC SMALL LETTER ER
437	`0x0433`, // CYRILLIC SMALL LETTER GHE
438	};
439	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicORG, isRussianDomainNameCharacter);
440
441	// http://cctld.by/rules.html
442	static const UChar cyrillicBEL[] = {
443	`'.'`,
444	`0x0431`, // CYRILLIC SMALL LETTER BE
445	`0x0435`, // CYRILLIC SMALL LETTER IE
446	`0x043B`, // CYRILLIC SMALL LETTER EL
447	};
448	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBEL, [](UChar ch) {
449	// Russian and Byelorussian letters, digits and dashes are allowed.
450	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x0456` \|\| ch == `0x045E` \|\| ch == `0x2019` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
451	});
452
453	// http://www.nic.kz/docs/poryadok_vnedreniya_kaz_ru.pdf
454	static const UChar cyrillicKAZ[] = {
455	`'.'`,
456	`0x049B`, // CYRILLIC SMALL LETTER KA WITH DESCENDER
457	`0x0430`, // CYRILLIC SMALL LETTER A
458	`0x0437`, // CYRILLIC SMALL LETTER ZE
459	};
460	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicKAZ, [](UChar ch) {
461	// Kazakh letters, digits and dashes are allowed.
462	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x04D9` \|\| ch == `0x0493` \|\| ch == `0x049B` \|\| ch == `0x04A3` \|\| ch == `0x04E9` \|\| ch == `0x04B1` \|\| ch == `0x04AF` \|\| ch == `0x04BB` \|\| ch == `0x0456` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
463	});
464
465	// http://uanic.net/docs/documents-ukr/Rules%20of%20UKR_v4.0.pdf
466	static const UChar cyrillicUKR[] = {
467	`'.'`,
468	`0x0443`, // CYRILLIC SMALL LETTER U
469	`0x043A`, // CYRILLIC SMALL LETTER KA
470	`0x0440`, // CYRILLIC SMALL LETTER ER
471	};
472	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicUKR, [](UChar ch) {
473	// Russian and Ukrainian letters, digits and dashes are allowed.
474	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x0491` \|\| ch == `0x0404` \|\| ch == `0x0456` \|\| ch == `0x0457` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
475	});
476
477	// http://www.rnids.rs/data/DOKUMENTI/idn-srb-policy-termsofuse-v1.4-eng.pdf
478	static const UChar cyrillicSRB[] = {
479	`'.'`,
480	`0x0441`, // CYRILLIC SMALL LETTER ES
481	`0x0440`, // CYRILLIC SMALL LETTER ER
482	`0x0431`, // CYRILLIC SMALL LETTER BE
483	};
484	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSRB, [](UChar ch) {
485	// Serbian letters, digits and dashes are allowed.
486	return (ch >= `0x0430` && ch <= `0x0438`) \|\| (ch >= `0x043A` && ch <= `0x0448`) \|\| ch == `0x0452` \|\| ch == `0x0458` \|\| ch == `0x0459` \|\| ch == `0x045A` \|\| ch == `0x045B` \|\| ch == `0x045F` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
487	});
488
489	// http://marnet.mk/doc/pravilnik-mk-mkd.pdf
490	static const UChar cyrillicMKD[] = {
491	`'.'`,
492	`0x043C`, // CYRILLIC SMALL LETTER EM
493	`0x043A`, // CYRILLIC SMALL LETTER KA
494	`0x0434`, // CYRILLIC SMALL LETTER DE
495	};
496	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMKD, [](UChar ch) {
497	// Macedonian letters, digits and dashes are allowed.
498	return (ch >= `0x0430` && ch <= `0x0438`) \|\| (ch >= `0x043A` && ch <= `0x0448`) \|\| ch == `0x0453` \|\| ch == `0x0455` \|\| ch == `0x0458` \|\| ch == `0x0459` \|\| ch == `0x045A` \|\| ch == `0x045C` \|\| ch == `0x045F` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
499	});
500
501	// https://www.mon.mn/cs/
502	static const UChar cyrillicMON[] = {
503	`'.'`,
504	`0x043C`, // CYRILLIC SMALL LETTER EM
505	`0x043E`, // CYRILLIC SMALL LETTER O
506	`0x043D`, // CYRILLIC SMALL LETTER EN
507	};
508	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMON, [](UChar ch) {
509	// Mongolian letters, digits and dashes are allowed.
510	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x04E9` \|\| ch == `0x04AF` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
511	});
512
513	// https://www.icann.org/sites/default/files/packages/lgr/lgr-second-level-bulgarian-30aug16-en.html
514	static const UChar cyrillicBG[] = {
515	`'.'`,
516	`0x0431`, // CYRILLIC SMALL LETTER BE
517	`0x0433` // CYRILLIC SMALL LETTER GHE
518	};
519	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBG, [](UChar ch) {
520	return (ch >= `0x0430` && ch <= `0x044A`) \|\| ch == `0x044C` \|\| (ch >= `0x044E` && ch <= `0x0450`) \|\| ch == `0x045D` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
521	});
522
523	// Not a known top level domain with special rules.
524	return false;
525	}
526
527	// Return value of null means no mapping is necessary.
528	Optional<String> mapHostName(const String& hostName, const Optional<URLDecodeFunction>& decodeFunction)
529	{
530	if (hostName.length() > hostNameBufferLength)
531	return String ();
532
533	if (!hostName.length())
534	return String ();
535
536	String string;
537	if (decodeFunction && string.contains(`'%'`))
538	string = (*decodeFunction)(hostName);
539	else
540	string = hostName;
541
542	unsigned length = string.length();
543
544	auto sourceBuffer = string.charactersWithNullTermination();
545
546	UChar destinationBuffer[hostNameBufferLength];
547	UErrorCode uerror = U_ZERO_ERROR;
548	UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
549	int32_t numCharactersConverted = (decodeFunction ? uidna_nameToASCII : uidna_nameToUnicode)(&URLParser::internationalDomainNameTranscoder(), sourceBuffer.data(), length, destinationBuffer, hostNameBufferLength, &processingDetails, &uerror);
550	if (length && (U_FAILURE(uerror) \|\| processingDetails.errors))
551	return nullopt;
552
553	if (numCharactersConverted == static_cast<int32_t>(length) && !memcmp(sourceBuffer.data(), destinationBuffer, length * sizeof(UChar)))
554	return String ();
555
556	if (!decodeFunction && !allCharactersInIDNScriptWhiteList(destinationBuffer, numCharactersConverted) && !allCharactersAllowedByTLDRules(destinationBuffer, numCharactersConverted))
557	return String ();
558
559	return String (destinationBuffer, numCharactersConverted);
560	}
561
562	using MappingRangesVector = Optional<Vector<std::tuple<unsigned, unsigned, String>>>;
563
564	static void collectRangesThatNeedMapping(const String& string, unsigned location, unsigned length, MappingRangesVector& array, const Optional<URLDecodeFunction>& decodeFunction)
565	{
566	// Generally, we want to optimize for the case where there is one host name that does not need mapping.
567	// Therefore, we use null to indicate no mapping here and an empty array to indicate error.
568
569	String substring = string.substringSharingImpl(location, length);
570	Optional<String> host = mapHostName(substring, decodeFunction);
571
572	if (host && !*host)
573	return;
574
575	if (!array)
576	array = Vector<std::tuple<unsigned, unsigned, String>>();
577
578	if (host)
579	array ->constructAndAppend(location, length, *host);
580	}
581
582	static void applyHostNameFunctionToMailToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array)
583	{
584	// In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' character.
585	// Skip quoted strings so that characters in them don't confuse us.
586	// When we find a '?' character, we are past the part of the URL that contains host names.
587
588	unsigned stringLength = string.length();
589	unsigned current = `0`;
590
591	while (`1`) {
592	// Find start of host name or of quoted string.
593	auto hostNameOrStringStart = string.find([](UChar ch) {
594	return ch == `'"'` \|\| ch == `'@'` \|\| ch == `'?'`;
595	}, current);
596	if (hostNameOrStringStart == notFound)
597	return;
598
599	UChar c = string [hostNameOrStringStart];
600	current = hostNameOrStringStart + `1`;
601
602	if (c == `'?'`)
603	return;
604
605	if (c == `'@'`) {
606	// Find end of host name.
607	unsigned hostNameStart = current;
608	auto hostNameEnd = string.find([](UChar ch) {
609	return ch == `'>'` \|\| ch == `','` \|\| ch == `'?'`;
610	}, current);
611
612	bool done;
613	if (hostNameEnd == notFound) {
614	hostNameEnd = stringLength;
615	done = true;
616	} else {
617	current = hostNameEnd;
618	done = false;
619	}
620
621	// Process host name range.
622	collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction);
623
624	if (done)
625	return;
626	} else {
627	// Skip quoted string.
628	ASSERT(c == `'"'`);
629	while (`1`) {
630	auto escapedCharacterOrStringEnd = string.find([](UChar ch) {
631	return ch == `'"'` \|\| ch == `'\\'`;
632	}, current);
633	if (escapedCharacterOrStringEnd == notFound)
634	return;
635
636	c = string [escapedCharacterOrStringEnd];
637	current = escapedCharacterOrStringEnd + `1`;
638
639	// If we are the end of the string, then break from the string loop back to the host name loop.
640	if (c == `'"'`)
641	break;
642
643	// Skip escaped character.
644	ASSERT(c == `'\\'`);
645	if (current == stringLength)
646	return;
647
648	++current;
649	}
650	}
651	}
652	}
653
654	static void applyHostNameFunctionToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array)
655	{
656	// Find hostnames. Too bad we can't use any real URL-parsing code to do this,
657	// but we have to do it before doing all the %-escaping, and this is the only
658	// code we have that parses mailto URLs anyway.
659
660	// Maybe we should implement this using a character buffer instead?
661
662	if (protocolIs(string, "mailto")) {
663	applyHostNameFunctionToMailToURLString(string, decodeFunction, array);
664	return;
665	}
666
667	// Find the host name in a hierarchical URL.
668	// It comes after a "://" sequence, with scheme characters preceding.
669	// If ends with the end of the string or a ":", "/", or a "?".
670	// If there is a "@" character, the host part is just the part after the "@".
671	static const char* separator = "://";
672	auto separatorIndex = string.find(separator);
673	if (separatorIndex == notFound)
674	return;
675
676	unsigned authorityStart = separatorIndex + strlen(separator);
677
678	// Check that all characters before the :// are valid scheme characters.
679	auto invalidSchemeCharacter = string.substringSharingImpl(`0`, separatorIndex).find([](UChar ch) {
680	static const char* allowedCharacters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-.";
681	static size_t length = strlen(allowedCharacters);
682	for (size_t i = `0`; i < length; ++i) {
683	if (allowedCharacters[i] == ch)
684	return false;
685	}
686	return true;
687	});
688
689	if (invalidSchemeCharacter != notFound)
690	return;
691
692	unsigned stringLength = string.length();
693
694	// Find terminating character.
695	auto hostNameTerminator = string.find([](UChar ch) {
696	static const char* terminatingCharacters = ":/?#";
697	static size_t length = strlen(terminatingCharacters);
698	for (size_t i = `0`; i < length; ++i) {
699	if (terminatingCharacters[i] == ch)
700	return true;
701	}
702	return false;
703	}, authorityStart);
704	unsigned hostNameEnd = hostNameTerminator == notFound ? stringLength : hostNameTerminator;
705
706	// Find "@" for the start of the host name.
707	auto userInfoTerminator = string.substringSharingImpl(`0`, hostNameEnd).find(`'@'`, authorityStart);
708	unsigned hostNameStart = userInfoTerminator == notFound ? authorityStart : userInfoTerminator + `1`;
709
710	collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction);
711	}
712
713	String mapHostNames(const String& string, const Optional<URLDecodeFunction>& decodeFunction)
714	{
715	// Generally, we want to optimize for the case where there is one host name that does not need mapping.
716
717	if (decodeFunction && string.isAllASCII())
718	return string;
719
720	// Make a list of ranges that actually need mapping.
721	MappingRangesVector hostNameRanges;
722	applyHostNameFunctionToURLString(string, decodeFunction, hostNameRanges);
723	if (!hostNameRanges)
724	return string;
725
726	if (hostNameRanges ->isEmpty())
727	return { };
728
729	// Do the mapping.
730	String result = string;
731	while (!hostNameRanges ->isEmpty()) {
732	unsigned location, length;
733	String mappedHostName;
734	std::tie(location, length, mappedHostName) = hostNameRanges ->takeLast();
735	result = result.replace(location, length, mappedHostName);
736	}
737	return result;
738	}
739
740	static String escapeUnsafeCharacters(const String& sourceBuffer)
741	{
742	unsigned length = sourceBuffer.length();
743
744	Optional<UChar32> previousCodePoint;
745
746	unsigned i;
747	for (i = `0`; i < length; ) {
748	UChar32 c = sourceBuffer.characterStartingAt(i);
749	if (isLookalikeCharacter(previousCodePoint, sourceBuffer.characterStartingAt(i)))
750	break;
751	previousCodePoint = c;
752	i += U16_LENGTH(c);
753	}
754
755	if (i == length)
756	return sourceBuffer;
757
758	Vector<UChar, urlBytesBufferLength> outBuffer;
759
760	outBuffer.grow(i);
761	if (sourceBuffer.is8Bit())
762	StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters8(), i);
763	else
764	StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters16(), i);
765
766	for (; i < length; ) {
767	UChar32 c = sourceBuffer.characterStartingAt(i);
768	unsigned characterLength = U16_LENGTH(c);
769	if (isLookalikeCharacter(previousCodePoint, c)) {
770	uint8_t utf8Buffer[`4`];
771	size_t offset = `0`;
772	UBool failure = false;
773	U8_APPEND(utf8Buffer, offset, `4`, c, failure);
774	ASSERT_UNUSED(failure, !failure);
775
776	for (size_t j = `0`; j < offset; ++j) {
777	outBuffer.append(`'%'`);
778	outBuffer.append(upperNibbleToASCIIHexDigit(utf8Buffer[j]));
779	outBuffer.append(lowerNibbleToASCIIHexDigit(utf8Buffer[j]));
780	}
781	} else {
782	for (unsigned j = `0`; j < characterLength; ++j)
783	outBuffer.append(sourceBuffer [i + j]);
784	}
785	previousCodePoint = c;
786	i += characterLength;
787	}
788
789	return String::adopt(WTFMove(outBuffer));
790	}
791
792	String userVisibleURL(const CString& url)
793	{
794	auto* before = reinterpret_cast<const unsigned char*>(url.data());
795	int length = url.length();
796
797	if (!length)
798	return { };
799
800	bool mayNeedHostNameDecoding = false;
801
802	Checked<int, RecordOverflow> bufferLength = length;
803	bufferLength = bufferLength * `3` + `1`; // The buffer should be large enough to %-escape every character.
804	if (bufferLength.hasOverflowed())
805	return { };
806	Vector<char, urlBytesBufferLength> after(bufferLength.unsafeGet());
807
808	char* q = after.data();
809	{
810	const unsigned char* p = before;
811	for (int i = `0`; i < length; i++) {
812	unsigned char c = p[i];
813	// unescape escape sequences that indicate bytes greater than 0x7f
814	if (c == `'%'` && i + `2` < length && isASCIIHexDigit(p[i + `1`]) && isASCIIHexDigit(p[i + `2`])) {
815	auto u = toASCIIHexValue(p[i + `1`], p[i + `2`]);
816	if (u > `0x7f`) {
817	// unescape
818	*q++ = u;
819	} else {
820	// do not unescape
821	*q++ = p[i];
822	*q++ = p[i + `1`];
823	*q++ = p[i + `2`];
824	}
825	i += `2`;
826	} else {
827	*q++ = c;
828
829	// Check for "xn--" in an efficient, non-case-sensitive, way.
830	if (c == `'-'` && i >= `3` && !mayNeedHostNameDecoding && (q[-`4`] \| `0x20`) == `'x'` && (q[-`3`] \| `0x20`) == `'n'` && q[-`2`] == `'-'`)
831	mayNeedHostNameDecoding = true;
832	}
833	}
834	*q = `'\0'`;
835	}
836
837	// Check string to see if it can be converted to display using UTF-8
838	String result = String::fromUTF8(after.data());
839	if (!result) {
840	// Could not convert to UTF-8.
841	// Convert characters greater than 0x7f to escape sequences.
842	// Shift current string to the end of the buffer
843	// then we will copy back bytes to the start of the buffer
844	// as we convert.
845	int afterlength = q - after.data();
846	char* p = after.data() + bufferLength.unsafeGet() - afterlength - `1`;
847	memmove(p, after.data(), afterlength + `1`); // copies trailing '\0'
848	char* q = after.data();
849	while (*p) {
850	unsigned char c = *p;
851	if (c > `0x7f`) {
852	*q++ = `'%'`;
853	*q++ = upperNibbleToASCIIHexDigit(c);
854	*q++ = lowerNibbleToASCIIHexDigit(c);
855	} else
856	q++ = p;
857	p++;
858	}
859	*q = `'\0'`;
860	// Note: after.data() points to a null-terminated, pure ASCII string.
861	result = String::fromUTF8(after.data());
862	ASSERT(!!result);
863	}
864
865	// Note: result is UTF–16 string, created from either a valid UTF-8 string,
866	// or a pure ASCII string (where all bytes with the high bit set are
867	// percent-encoded).
868
869	if (mayNeedHostNameDecoding) {
870	// FIXME: Is it good to ignore the failure of mapHostNames and keep result intact?
871	auto mappedResult = mapHostNames(result, nullopt);
872	if (!!mappedResult)
873	result = mappedResult;
874	}
875
876	return escapeUnsafeCharacters(normalizedNFC(result));
877	}
878
879	} // namespace URLHelpers
880	} // namespace WTF
881

Browse the source code of webcore/Source/WTF/wtf/URLHelpers.cpp