1/*
2 * Copyright (C) 2005-2019 Apple Inc. All rights reserved.
3 * Copyright (C) 2018 Igalia S.L.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of Apple Inc. ("Apple") nor the names of
15 * its contributors may be used to endorse or promote products derived
16 * from this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#include "config.h"
31#include "URLHelpers.h"
32
33#include "URLParser.h"
34#include <mutex>
35#include <unicode/uidna.h>
36#include <unicode/uscript.h>
37#include <wtf/Optional.h>
38#include <wtf/text/WTFString.h>
39
40namespace WTF {
41namespace URLHelpers {
42
// Maximum host name length, in UTF-16 code units, that mapHostName() will process;
// it is also the size of the ICU destination buffer used there.
// Needs to be big enough to hold an IDN-encoded name.
// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
const unsigned hostNameBufferLength = 2048;
// Inline capacity of the temporary buffers used by escapeUnsafeCharacters() and userVisibleURL().
const unsigned urlBytesBufferLength = 2048;

// Bit set with one bit per ICU UScriptCode value (32 scripts per word).
// A set bit means whiteListIDNScript() has been called for that script.
static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + 31) / 32];
49
50#if !PLATFORM(COCOA)
51
52// Cocoa has an implementation that uses a whitelist in /Library or ~/Library,
53// if it exists.
54void loadIDNScriptWhiteList()
55{
56 static std::once_flag flag;
57 std::call_once(flag, initializeDefaultIDNScriptWhiteList);
58}
59
60#endif // !PLATFORM(COCOA)
61
62static bool isArmenianLookalikeCharacter(UChar32 codePoint)
63{
64 return codePoint == 0x0548 || codePoint == 0x054D || codePoint == 0x0578 || codePoint == 0x057D;
65}
66
67static bool isArmenianScriptCharacter(UChar32 codePoint)
68{
69 UErrorCode error = U_ZERO_ERROR;
70 UScriptCode script = uscript_getScript(codePoint, &error);
71 if (error != U_ZERO_ERROR) {
72 LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
73 return false;
74 }
75
76 return script == USCRIPT_ARMENIAN;
77}
78
// Returns true when charCode is an ASCII digit or punctuation character that is
// not one of the characters the URL parser rejects in a host.
template<typename CharacterType> inline bool isASCIIDigitOrValidHostCharacter(CharacterType charCode)
{
    // Things the URL Parser rejects:
    const bool rejectedByURLParser = charCode == '#'
        || charCode == '%'
        || charCode == '/'
        || charCode == ':'
        || charCode == '?'
        || charCode == '@'
        || charCode == '['
        || charCode == '\\'
        || charCode == ']';

    return isASCIIDigitOrPunctuation(charCode) && !rejectedByURLParser;
}
100
// Decides whether charCode must be treated as an unsafe "lookalike" character.
//   previousCodePoint: the code point that preceded charCode in the text being scanned,
//                      or an empty Optional when charCode is the first one.
//   charCode:          the code point to classify.
// Returns true when the character is unsafe; callers in this file either reject the
// host name or percent-escape the character.
static bool isLookalikeCharacter(const Optional<UChar32>& previousCodePoint, UChar32 charCode)
{
    // This function treats the following as unsafe, lookalike characters:
    // any non-printable character, any character considered as whitespace,
    // any ignorable character, and emoji characters related to locks.

    // We also considered the characters in Mozilla's blacklist <http://kb.mozillazine.org/Network.IDN.blacklist_chars>.

    // Some of the characters here will never appear once ICU has encoded.
    // For example, ICU transforms most spaces into an ASCII space and most
    // slashes into an ASCII solidus. But one of the two callers uses this
    // on characters that have not been processed by ICU, so they are needed here.

    if (!u_isprint(charCode) || u_isUWhiteSpace(charCode) || u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
        return true;

    switch (charCode) {
    // Characters that are always considered lookalikes, regardless of context.
    case 0x00BC: /* VULGAR FRACTION ONE QUARTER */
    case 0x00BD: /* VULGAR FRACTION ONE HALF */
    case 0x00BE: /* VULGAR FRACTION THREE QUARTERS */
    case 0x00ED: /* LATIN SMALL LETTER I WITH ACUTE */
    /* 0x0131 LATIN SMALL LETTER DOTLESS I is intentionally not considered a lookalike character because it is visually distinguishable from i and it has legitimate use in the Turkish language. */
    case 0x01C0: /* LATIN LETTER DENTAL CLICK */
    case 0x01C3: /* LATIN LETTER RETROFLEX CLICK */
    case 0x0251: /* LATIN SMALL LETTER ALPHA */
    case 0x0261: /* LATIN SMALL LETTER SCRIPT G */
    case 0x027E: /* LATIN SMALL LETTER R WITH FISHHOOK */
    case 0x02D0: /* MODIFIER LETTER TRIANGULAR COLON */
    case 0x0335: /* COMBINING SHORT STROKE OVERLAY */
    case 0x0337: /* COMBINING SHORT SOLIDUS OVERLAY */
    case 0x0338: /* COMBINING LONG SOLIDUS OVERLAY */
    case 0x0589: /* ARMENIAN FULL STOP */
    case 0x05B4: /* HEBREW POINT HIRIQ */
    case 0x05BC: /* HEBREW POINT DAGESH OR MAPIQ */
    case 0x05C3: /* HEBREW PUNCTUATION SOF PASUQ */
    case 0x05F4: /* HEBREW PUNCTUATION GERSHAYIM */
    case 0x0609: /* ARABIC-INDIC PER MILLE SIGN */
    case 0x060A: /* ARABIC-INDIC PER TEN THOUSAND SIGN */
    case 0x0650: /* ARABIC KASRA */
    case 0x0660: /* ARABIC-INDIC DIGIT ZERO */
    case 0x066A: /* ARABIC PERCENT SIGN */
    case 0x06D4: /* ARABIC FULL STOP */
    case 0x06F0: /* EXTENDED ARABIC-INDIC DIGIT ZERO */
    case 0x0701: /* SYRIAC SUPRALINEAR FULL STOP */
    case 0x0702: /* SYRIAC SUBLINEAR FULL STOP */
    case 0x0703: /* SYRIAC SUPRALINEAR COLON */
    case 0x0704: /* SYRIAC SUBLINEAR COLON */
    case 0x1735: /* PHILIPPINE SINGLE PUNCTUATION */
    case 0x1D04: /* LATIN LETTER SMALL CAPITAL C */
    case 0x1D0F: /* LATIN LETTER SMALL CAPITAL O */
    case 0x1D1C: /* LATIN LETTER SMALL CAPITAL U */
    case 0x1D20: /* LATIN LETTER SMALL CAPITAL V */
    case 0x1D21: /* LATIN LETTER SMALL CAPITAL W */
    case 0x1D22: /* LATIN LETTER SMALL CAPITAL Z */
    case 0x1ECD: /* LATIN SMALL LETTER O WITH DOT BELOW */
    case 0x2010: /* HYPHEN */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x2024: /* ONE DOT LEADER */
    case 0x2027: /* HYPHENATION POINT */
    case 0x2039: /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
    case 0x203A: /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
    case 0x2041: /* CARET INSERTION POINT */
    case 0x2044: /* FRACTION SLASH */
    case 0x2052: /* COMMERCIAL MINUS SIGN */
    case 0x2153: /* VULGAR FRACTION ONE THIRD */
    case 0x2154: /* VULGAR FRACTION TWO THIRDS */
    case 0x2155: /* VULGAR FRACTION ONE FIFTH */
    case 0x2156: /* VULGAR FRACTION TWO FIFTHS */
    case 0x2157: /* VULGAR FRACTION THREE FIFTHS */
    case 0x2158: /* VULGAR FRACTION FOUR FIFTHS */
    case 0x2159: /* VULGAR FRACTION ONE SIXTH */
    case 0x215A: /* VULGAR FRACTION FIVE SIXTHS */
    case 0x215B: /* VULGAR FRACTION ONE EIGHTH */
    case 0x215C: /* VULGAR FRACTION THREE EIGHTHS */
    case 0x215D: /* VULGAR FRACTION FIVE EIGHTHS */
    case 0x215E: /* VULGAR FRACTION SEVEN EIGHTHS */
    case 0x215F: /* FRACTION NUMERATOR ONE */
    case 0x2212: /* MINUS SIGN */
    case 0x2215: /* DIVISION SLASH */
    case 0x2216: /* SET MINUS */
    case 0x2236: /* RATIO */
    case 0x233F: /* APL FUNCTIONAL SYMBOL SLASH BAR */
    case 0x23AE: /* INTEGRAL EXTENSION */
    case 0x244A: /* OCR DOUBLE BACKSLASH */
    case 0x2571: /* BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT */
    case 0x2572: /* BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT */
    case 0x29F6: /* SOLIDUS WITH OVERBAR */
    case 0x29F8: /* BIG SOLIDUS */
    case 0x2AFB: /* TRIPLE SOLIDUS BINARY RELATION */
    case 0x2AFD: /* DOUBLE SOLIDUS OPERATOR */
    case 0x2FF0: /* IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT */
    case 0x2FF1: /* IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW */
    case 0x2FF2: /* IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT */
    case 0x2FF3: /* IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW */
    case 0x2FF4: /* IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND */
    case 0x2FF5: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE */
    case 0x2FF6: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM BELOW */
    case 0x2FF7: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LEFT */
    case 0x2FF8: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER LEFT */
    case 0x2FF9: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT */
    case 0x2FFA: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT */
    case 0x2FFB: /* IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID */
    case 0x3002: /* IDEOGRAPHIC FULL STOP */
    case 0x3008: /* LEFT ANGLE BRACKET */
    case 0x3014: /* LEFT TORTOISE SHELL BRACKET */
    case 0x3015: /* RIGHT TORTOISE SHELL BRACKET */
    case 0x3033: /* VERTICAL KANA REPEAT MARK UPPER HALF */
    case 0x3035: /* VERTICAL KANA REPEAT MARK LOWER HALF */
    case 0x321D: /* PARENTHESIZED KOREAN CHARACTER OJEON */
    case 0x321E: /* PARENTHESIZED KOREAN CHARACTER O HU */
    case 0x33AE: /* SQUARE RAD OVER S */
    case 0x33AF: /* SQUARE RAD OVER S SQUARED */
    case 0x33C6: /* SQUARE C OVER KG */
    case 0x33DF: /* SQUARE A OVER M */
    case 0x05B9: /* HEBREW POINT HOLAM */
    case 0x05BA: /* HEBREW POINT HOLAM HASER FOR VAV */
    case 0x05C1: /* HEBREW POINT SHIN DOT */
    case 0x05C2: /* HEBREW POINT SIN DOT */
    case 0x05C4: /* HEBREW MARK UPPER DOT */
    case 0xA731: /* LATIN LETTER SMALL CAPITAL S */
    case 0xA771: /* LATIN SMALL LETTER DUM */
    case 0xA789: /* MODIFIER LETTER COLON */
    case 0xFE14: /* PRESENTATION FORM FOR VERTICAL SEMICOLON */
    case 0xFE15: /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
    case 0xFE3F: /* PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET */
    case 0xFE5D: /* SMALL LEFT TORTOISE SHELL BRACKET */
    case 0xFE5E: /* SMALL RIGHT TORTOISE SHELL BRACKET */
    case 0xFF0E: /* FULLWIDTH FULL STOP */
    case 0xFF0F: /* FULLWIDTH SOLIDUS */
    case 0xFF61: /* HALFWIDTH IDEOGRAPHIC FULL STOP */
    case 0xFFFC: /* OBJECT REPLACEMENT CHARACTER */
    case 0xFFFD: /* REPLACEMENT CHARACTER */
    case 0x1F50F: /* LOCK WITH INK PEN */
    case 0x1F510: /* CLOSED LOCK WITH KEY */
    case 0x1F511: /* KEY */
    case 0x1F512: /* LOCK */
    case 0x1F513: /* OPEN LOCK */
        return true;
    // A combining dot is only unsafe directly after one of the base letters listed below.
    case 0x0307: /* COMBINING DOT ABOVE */
        return previousCodePoint == 0x0237 /* LATIN SMALL LETTER DOTLESS J */
            || previousCodePoint == 0x0131 /* LATIN SMALL LETTER DOTLESS I */
            || previousCodePoint == 0x05D5; /* HEBREW LETTER VAV */
    // These Armenian letters are unsafe only when they follow a character that is neither
    // Armenian nor an ASCII digit / valid host punctuation character. They are accepted
    // at the very start of the text (no previous code point).
    case 0x0548: /* ARMENIAN CAPITAL LETTER VO */
    case 0x054D: /* ARMENIAN CAPITAL LETTER SEH */
    case 0x0578: /* ARMENIAN SMALL LETTER VO */
    case 0x057D: /* ARMENIAN SMALL LETTER SEH */
        return previousCodePoint
            && !isASCIIDigitOrValidHostCharacter(previousCodePoint.value())
            && !isArmenianScriptCharacter(previousCodePoint.value());
    // A label separator is never a lookalike, even directly after an Armenian lookalike letter.
    case '.':
        return false;
    // Any other character is unsafe only when it directly follows one of the four Armenian
    // lookalike letters and is itself neither Armenian nor an ASCII digit / valid host
    // punctuation character.
    default:
        return previousCodePoint
            && isArmenianLookalikeCharacter(previousCodePoint.value())
            && !(isArmenianScriptCharacter(charCode) || isASCIIDigitOrValidHostCharacter(charCode));
    }
}
258
259void whiteListIDNScript(const char* scriptName)
260{
261 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName);
262 if (script >= 0 && script < USCRIPT_CODE_LIMIT) {
263 size_t index = script / 32;
264 uint32_t mask = 1 << (script % 32);
265 IDNScriptWhiteList[index] |= mask;
266 }
267}
268
269void initializeDefaultIDNScriptWhiteList()
270{
271 const char* defaultIDNScriptWhiteList[20] = {
272 "Common",
273 "Inherited",
274 "Arabic",
275 "Armenian",
276 "Bopomofo",
277 "Canadian_Aboriginal",
278 "Devanagari",
279 "Deseret",
280 "Gujarati",
281 "Gurmukhi",
282 "Hangul",
283 "Han",
284 "Hebrew",
285 "Hiragana",
286 "Katakana_Or_Hiragana",
287 "Katakana",
288 "Latin",
289 "Tamil",
290 "Thai",
291 "Yi",
292 };
293 for (const char* scriptName : defaultIDNScriptWhiteList)
294 whiteListIDNScript(scriptName);
295}
296
297static bool allCharactersInIDNScriptWhiteList(const UChar* buffer, int32_t length)
298{
299 loadIDNScriptWhiteList();
300 int32_t i = 0;
301 Optional<UChar32> previousCodePoint;
302 while (i < length) {
303 UChar32 c;
304 U16_NEXT(buffer, i, length, c)
305 UErrorCode error = U_ZERO_ERROR;
306 UScriptCode script = uscript_getScript(c, &error);
307 if (error != U_ZERO_ERROR) {
308 LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
309 return false;
310 }
311 if (script < 0) {
312 LOG_ERROR("got negative number for script code from ICU: %d", script);
313 return false;
314 }
315 if (script >= USCRIPT_CODE_LIMIT)
316 return false;
317
318 size_t index = script / 32;
319 uint32_t mask = 1 << (script % 32);
320 if (!(IDNScriptWhiteList[index] & mask))
321 return false;
322
323 if (isLookalikeCharacter(previousCodePoint, c))
324 return false;
325 previousCodePoint = c;
326 }
327 return true;
328}
329
330static bool isSecondLevelDomainNameAllowedByTLDRules(const UChar* buffer, int32_t length, const WTF::Function<bool(UChar)>& characterIsAllowed)
331{
332 ASSERT(length > 0);
333
334 for (int32_t i = length - 1; i >= 0; --i) {
335 UChar ch = buffer[i];
336
337 if (characterIsAllowed(ch))
338 continue;
339
340 // Only check the second level domain. Lower level registrars may have different rules.
341 if (ch == '.')
342 break;
343
344 return false;
345 }
346 return true;
347}
348
// Helper for allCharactersAllowedByTLDRules(); it relies on that function's `buffer` and
// `length` variables being in scope.
// `suffix` must be a UChar array (not a pointer, because sizeof is applied to it) and
// `function` a predicate taking a UChar.
// If the text is longer than `suffix` and ends with it, this returns from the enclosing
// function with the result of isSecondLevelDomainNameAllowedByTLDRules() applied to the
// part before the suffix. Otherwise execution continues with the next statement.
#define CHECK_RULES_IF_SUFFIX_MATCHES(suffix, function) \
    { \
        static const int32_t suffixLength = sizeof(suffix) / sizeof(suffix[0]); \
        if (length > suffixLength && !memcmp(buffer + length - suffixLength, suffix, sizeof(suffix))) \
            return isSecondLevelDomainNameAllowedByTLDRules(buffer, length - suffixLength, function); \
    }
355
356static bool isRussianDomainNameCharacter(UChar ch)
357{
358 // Only modern Russian letters, digits and dashes are allowed.
359 return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || isASCIIDigit(ch) || ch == '-';
360}
361
362static bool allCharactersAllowedByTLDRules(const UChar* buffer, int32_t length)
363{
364 // Skip trailing dot for root domain.
365 if (buffer[length - 1] == '.')
366 length--;
367
368 // http://cctld.ru/files/pdf/docs/rules_ru-rf.pdf
369 static const UChar cyrillicRF[] = {
370 '.',
371 0x0440, // CYRILLIC SMALL LETTER ER
372 0x0444, // CYRILLIC SMALL LETTER EF
373 };
374 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRF, isRussianDomainNameCharacter);
375
376 // http://rusnames.ru/rules.pl
377 static const UChar cyrillicRUS[] = {
378 '.',
379 0x0440, // CYRILLIC SMALL LETTER ER
380 0x0443, // CYRILLIC SMALL LETTER U
381 0x0441, // CYRILLIC SMALL LETTER ES
382 };
383 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRUS, isRussianDomainNameCharacter);
384
385 // http://ru.faitid.org/projects/moscow/documents/moskva/idn
386 static const UChar cyrillicMOSKVA[] = {
387 '.',
388 0x043C, // CYRILLIC SMALL LETTER EM
389 0x043E, // CYRILLIC SMALL LETTER O
390 0x0441, // CYRILLIC SMALL LETTER ES
391 0x043A, // CYRILLIC SMALL LETTER KA
392 0x0432, // CYRILLIC SMALL LETTER VE
393 0x0430, // CYRILLIC SMALL LETTER A
394 };
395 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMOSKVA, isRussianDomainNameCharacter);
396
397 // http://www.dotdeti.ru/foruser/docs/regrules.php
398 static const UChar cyrillicDETI[] = {
399 '.',
400 0x0434, // CYRILLIC SMALL LETTER DE
401 0x0435, // CYRILLIC SMALL LETTER IE
402 0x0442, // CYRILLIC SMALL LETTER TE
403 0x0438, // CYRILLIC SMALL LETTER I
404 };
405 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicDETI, isRussianDomainNameCharacter);
406
407 // http://corenic.org - rules not published. The word is Russian, so only allowing Russian at this time,
408 // although we may need to revise the checks if this ends up being used with other languages spoken in Russia.
409 static const UChar cyrillicONLAYN[] = {
410 '.',
411 0x043E, // CYRILLIC SMALL LETTER O
412 0x043D, // CYRILLIC SMALL LETTER EN
413 0x043B, // CYRILLIC SMALL LETTER EL
414 0x0430, // CYRILLIC SMALL LETTER A
415 0x0439, // CYRILLIC SMALL LETTER SHORT I
416 0x043D, // CYRILLIC SMALL LETTER EN
417 };
418 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicONLAYN, isRussianDomainNameCharacter);
419
420 // http://corenic.org - same as above.
421 static const UChar cyrillicSAYT[] = {
422 '.',
423 0x0441, // CYRILLIC SMALL LETTER ES
424 0x0430, // CYRILLIC SMALL LETTER A
425 0x0439, // CYRILLIC SMALL LETTER SHORT I
426 0x0442, // CYRILLIC SMALL LETTER TE
427 };
428 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSAYT, isRussianDomainNameCharacter);
429
430 // http://pir.org/products/opr-domain/ - rules not published. According to the registry site,
431 // the intended audience is "Russian and other Slavic-speaking markets".
432 // Chrome appears to only allow Russian, so sticking with that for now.
433 static const UChar cyrillicORG[] = {
434 '.',
435 0x043E, // CYRILLIC SMALL LETTER O
436 0x0440, // CYRILLIC SMALL LETTER ER
437 0x0433, // CYRILLIC SMALL LETTER GHE
438 };
439 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicORG, isRussianDomainNameCharacter);
440
441 // http://cctld.by/rules.html
442 static const UChar cyrillicBEL[] = {
443 '.',
444 0x0431, // CYRILLIC SMALL LETTER BE
445 0x0435, // CYRILLIC SMALL LETTER IE
446 0x043B, // CYRILLIC SMALL LETTER EL
447 };
448 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBEL, [](UChar ch) {
449 // Russian and Byelorussian letters, digits and dashes are allowed.
450 return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x0456 || ch == 0x045E || ch == 0x2019 || isASCIIDigit(ch) || ch == '-';
451 });
452
453 // http://www.nic.kz/docs/poryadok_vnedreniya_kaz_ru.pdf
454 static const UChar cyrillicKAZ[] = {
455 '.',
456 0x049B, // CYRILLIC SMALL LETTER KA WITH DESCENDER
457 0x0430, // CYRILLIC SMALL LETTER A
458 0x0437, // CYRILLIC SMALL LETTER ZE
459 };
460 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicKAZ, [](UChar ch) {
461 // Kazakh letters, digits and dashes are allowed.
462 return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x04D9 || ch == 0x0493 || ch == 0x049B || ch == 0x04A3 || ch == 0x04E9 || ch == 0x04B1 || ch == 0x04AF || ch == 0x04BB || ch == 0x0456 || isASCIIDigit(ch) || ch == '-';
463 });
464
465 // http://uanic.net/docs/documents-ukr/Rules%20of%20UKR_v4.0.pdf
466 static const UChar cyrillicUKR[] = {
467 '.',
468 0x0443, // CYRILLIC SMALL LETTER U
469 0x043A, // CYRILLIC SMALL LETTER KA
470 0x0440, // CYRILLIC SMALL LETTER ER
471 };
472 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicUKR, [](UChar ch) {
473 // Russian and Ukrainian letters, digits and dashes are allowed.
474 return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x0491 || ch == 0x0404 || ch == 0x0456 || ch == 0x0457 || isASCIIDigit(ch) || ch == '-';
475 });
476
477 // http://www.rnids.rs/data/DOKUMENTI/idn-srb-policy-termsofuse-v1.4-eng.pdf
478 static const UChar cyrillicSRB[] = {
479 '.',
480 0x0441, // CYRILLIC SMALL LETTER ES
481 0x0440, // CYRILLIC SMALL LETTER ER
482 0x0431, // CYRILLIC SMALL LETTER BE
483 };
484 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSRB, [](UChar ch) {
485 // Serbian letters, digits and dashes are allowed.
486 return (ch >= 0x0430 && ch <= 0x0438) || (ch >= 0x043A && ch <= 0x0448) || ch == 0x0452 || ch == 0x0458 || ch == 0x0459 || ch == 0x045A || ch == 0x045B || ch == 0x045F || isASCIIDigit(ch) || ch == '-';
487 });
488
489 // http://marnet.mk/doc/pravilnik-mk-mkd.pdf
490 static const UChar cyrillicMKD[] = {
491 '.',
492 0x043C, // CYRILLIC SMALL LETTER EM
493 0x043A, // CYRILLIC SMALL LETTER KA
494 0x0434, // CYRILLIC SMALL LETTER DE
495 };
496 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMKD, [](UChar ch) {
497 // Macedonian letters, digits and dashes are allowed.
498 return (ch >= 0x0430 && ch <= 0x0438) || (ch >= 0x043A && ch <= 0x0448) || ch == 0x0453 || ch == 0x0455 || ch == 0x0458 || ch == 0x0459 || ch == 0x045A || ch == 0x045C || ch == 0x045F || isASCIIDigit(ch) || ch == '-';
499 });
500
501 // https://www.mon.mn/cs/
502 static const UChar cyrillicMON[] = {
503 '.',
504 0x043C, // CYRILLIC SMALL LETTER EM
505 0x043E, // CYRILLIC SMALL LETTER O
506 0x043D, // CYRILLIC SMALL LETTER EN
507 };
508 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMON, [](UChar ch) {
509 // Mongolian letters, digits and dashes are allowed.
510 return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x04E9 || ch == 0x04AF || isASCIIDigit(ch) || ch == '-';
511 });
512
513 // https://www.icann.org/sites/default/files/packages/lgr/lgr-second-level-bulgarian-30aug16-en.html
514 static const UChar cyrillicBG[] = {
515 '.',
516 0x0431, // CYRILLIC SMALL LETTER BE
517 0x0433 // CYRILLIC SMALL LETTER GHE
518 };
519 CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBG, [](UChar ch) {
520 return (ch >= 0x0430 && ch <= 0x044A) || ch == 0x044C || (ch >= 0x044E && ch <= 0x0450) || ch == 0x045D || isASCIIDigit(ch) || ch == '-';
521 });
522
523 // Not a known top level domain with special rules.
524 return false;
525}
526
527// Return value of null means no mapping is necessary.
528Optional<String> mapHostName(const String& hostName, const Optional<URLDecodeFunction>& decodeFunction)
529{
530 if (hostName.length() > hostNameBufferLength)
531 return String();
532
533 if (!hostName.length())
534 return String();
535
536 String string;
537 if (decodeFunction && string.contains('%'))
538 string = (*decodeFunction)(hostName);
539 else
540 string = hostName;
541
542 unsigned length = string.length();
543
544 auto sourceBuffer = string.charactersWithNullTermination();
545
546 UChar destinationBuffer[hostNameBufferLength];
547 UErrorCode uerror = U_ZERO_ERROR;
548 UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
549 int32_t numCharactersConverted = (decodeFunction ? uidna_nameToASCII : uidna_nameToUnicode)(&URLParser::internationalDomainNameTranscoder(), sourceBuffer.data(), length, destinationBuffer, hostNameBufferLength, &processingDetails, &uerror);
550 if (length && (U_FAILURE(uerror) || processingDetails.errors))
551 return nullopt;
552
553 if (numCharactersConverted == static_cast<int32_t>(length) && !memcmp(sourceBuffer.data(), destinationBuffer, length * sizeof(UChar)))
554 return String();
555
556 if (!decodeFunction && !allCharactersInIDNScriptWhiteList(destinationBuffer, numCharactersConverted) && !allCharactersAllowedByTLDRules(destinationBuffer, numCharactersConverted))
557 return String();
558
559 return String(destinationBuffer, numCharactersConverted);
560}
561
// Collected (location, length, replacement host name) ranges of a URL string.
// nullopt means no host name needed mapping; an engaged but empty vector indicates an error.
using MappingRangesVector = Optional<Vector<std::tuple<unsigned, unsigned, String>>>;
563
564static void collectRangesThatNeedMapping(const String& string, unsigned location, unsigned length, MappingRangesVector& array, const Optional<URLDecodeFunction>& decodeFunction)
565{
566 // Generally, we want to optimize for the case where there is one host name that does not need mapping.
567 // Therefore, we use null to indicate no mapping here and an empty array to indicate error.
568
569 String substring = string.substringSharingImpl(location, length);
570 Optional<String> host = mapHostName(substring, decodeFunction);
571
572 if (host && !*host)
573 return;
574
575 if (!array)
576 array = Vector<std::tuple<unsigned, unsigned, String>>();
577
578 if (host)
579 array->constructAndAppend(location, length, *host);
580}
581
// Scans a mailto: URL string for host names and passes each one to collectRangesThatNeedMapping(),
// which records in array the ranges that need to be rewritten.
static void applyHostNameFunctionToMailToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array)
{
    // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' character.
    // Skip quoted strings so that characters in them don't confuse us.
    // When we find a '?' character, we are past the part of the URL that contains host names.

    unsigned stringLength = string.length();
    // Index from which the next search starts.
    unsigned current = 0;

    while (1) {
        // Find start of host name or of quoted string.
        auto hostNameOrStringStart = string.find([](UChar ch) {
            return ch == '"' || ch == '@' || ch == '?';
        }, current);
        if (hostNameOrStringStart == notFound)
            return;

        UChar c = string[hostNameOrStringStart];
        current = hostNameOrStringStart + 1;

        // No host names can follow the '?'.
        if (c == '?')
            return;

        if (c == '@') {
            // Find end of host name.
            unsigned hostNameStart = current;
            auto hostNameEnd = string.find([](UChar ch) {
                return ch == '>' || ch == ',' || ch == '?';
            }, current);

            bool done;
            if (hostNameEnd == notFound) {
                // The host name runs to the end of the string; this is the last one.
                hostNameEnd = stringLength;
                done = true;
            } else {
                // Resume at the terminator; if it is a '?', the next iteration stops the scan.
                current = hostNameEnd;
                done = false;
            }

            // Process host name range.
            collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction);

            if (done)
                return;
        } else {
            // Skip quoted string.
            ASSERT(c == '"');
            while (1) {
                auto escapedCharacterOrStringEnd = string.find([](UChar ch) {
                    return ch == '"' || ch == '\\';
                }, current);
                // Unterminated quoted string: nothing more to find.
                if (escapedCharacterOrStringEnd == notFound)
                    return;

                c = string[escapedCharacterOrStringEnd];
                current = escapedCharacterOrStringEnd + 1;

                // If we are the end of the string, then break from the string loop back to the host name loop.
                if (c == '"')
                    break;

                // Skip escaped character.
                ASSERT(c == '\\');
                // A backslash as the very last character has nothing to escape.
                if (current == stringLength)
                    return;

                ++current;
            }
        }
    }
}
653
654static void applyHostNameFunctionToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array)
655{
656 // Find hostnames. Too bad we can't use any real URL-parsing code to do this,
657 // but we have to do it before doing all the %-escaping, and this is the only
658 // code we have that parses mailto URLs anyway.
659
660 // Maybe we should implement this using a character buffer instead?
661
662 if (protocolIs(string, "mailto")) {
663 applyHostNameFunctionToMailToURLString(string, decodeFunction, array);
664 return;
665 }
666
667 // Find the host name in a hierarchical URL.
668 // It comes after a "://" sequence, with scheme characters preceding.
669 // If ends with the end of the string or a ":", "/", or a "?".
670 // If there is a "@" character, the host part is just the part after the "@".
671 static const char* separator = "://";
672 auto separatorIndex = string.find(separator);
673 if (separatorIndex == notFound)
674 return;
675
676 unsigned authorityStart = separatorIndex + strlen(separator);
677
678 // Check that all characters before the :// are valid scheme characters.
679 auto invalidSchemeCharacter = string.substringSharingImpl(0, separatorIndex).find([](UChar ch) {
680 static const char* allowedCharacters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-.";
681 static size_t length = strlen(allowedCharacters);
682 for (size_t i = 0; i < length; ++i) {
683 if (allowedCharacters[i] == ch)
684 return false;
685 }
686 return true;
687 });
688
689 if (invalidSchemeCharacter != notFound)
690 return;
691
692 unsigned stringLength = string.length();
693
694 // Find terminating character.
695 auto hostNameTerminator = string.find([](UChar ch) {
696 static const char* terminatingCharacters = ":/?#";
697 static size_t length = strlen(terminatingCharacters);
698 for (size_t i = 0; i < length; ++i) {
699 if (terminatingCharacters[i] == ch)
700 return true;
701 }
702 return false;
703 }, authorityStart);
704 unsigned hostNameEnd = hostNameTerminator == notFound ? stringLength : hostNameTerminator;
705
706 // Find "@" for the start of the host name.
707 auto userInfoTerminator = string.substringSharingImpl(0, hostNameEnd).find('@', authorityStart);
708 unsigned hostNameStart = userInfoTerminator == notFound ? authorityStart : userInfoTerminator + 1;
709
710 collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction);
711}
712
713String mapHostNames(const String& string, const Optional<URLDecodeFunction>& decodeFunction)
714{
715 // Generally, we want to optimize for the case where there is one host name that does not need mapping.
716
717 if (decodeFunction && string.isAllASCII())
718 return string;
719
720 // Make a list of ranges that actually need mapping.
721 MappingRangesVector hostNameRanges;
722 applyHostNameFunctionToURLString(string, decodeFunction, hostNameRanges);
723 if (!hostNameRanges)
724 return string;
725
726 if (hostNameRanges->isEmpty())
727 return { };
728
729 // Do the mapping.
730 String result = string;
731 while (!hostNameRanges->isEmpty()) {
732 unsigned location, length;
733 String mappedHostName;
734 std::tie(location, length, mappedHostName) = hostNameRanges->takeLast();
735 result = result.replace(location, length, mappedHostName);
736 }
737 return result;
738}
739
740static String escapeUnsafeCharacters(const String& sourceBuffer)
741{
742 unsigned length = sourceBuffer.length();
743
744 Optional<UChar32> previousCodePoint;
745
746 unsigned i;
747 for (i = 0; i < length; ) {
748 UChar32 c = sourceBuffer.characterStartingAt(i);
749 if (isLookalikeCharacter(previousCodePoint, sourceBuffer.characterStartingAt(i)))
750 break;
751 previousCodePoint = c;
752 i += U16_LENGTH(c);
753 }
754
755 if (i == length)
756 return sourceBuffer;
757
758 Vector<UChar, urlBytesBufferLength> outBuffer;
759
760 outBuffer.grow(i);
761 if (sourceBuffer.is8Bit())
762 StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters8(), i);
763 else
764 StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters16(), i);
765
766 for (; i < length; ) {
767 UChar32 c = sourceBuffer.characterStartingAt(i);
768 unsigned characterLength = U16_LENGTH(c);
769 if (isLookalikeCharacter(previousCodePoint, c)) {
770 uint8_t utf8Buffer[4];
771 size_t offset = 0;
772 UBool failure = false;
773 U8_APPEND(utf8Buffer, offset, 4, c, failure);
774 ASSERT_UNUSED(failure, !failure);
775
776 for (size_t j = 0; j < offset; ++j) {
777 outBuffer.append('%');
778 outBuffer.append(upperNibbleToASCIIHexDigit(utf8Buffer[j]));
779 outBuffer.append(lowerNibbleToASCIIHexDigit(utf8Buffer[j]));
780 }
781 } else {
782 for (unsigned j = 0; j < characterLength; ++j)
783 outBuffer.append(sourceBuffer[i + j]);
784 }
785 previousCodePoint = c;
786 i += characterLength;
787 }
788
789 return String::adopt(WTFMove(outBuffer));
790}
791
792String userVisibleURL(const CString& url)
793{
794 auto* before = reinterpret_cast<const unsigned char*>(url.data());
795 int length = url.length();
796
797 if (!length)
798 return { };
799
800 bool mayNeedHostNameDecoding = false;
801
802 Checked<int, RecordOverflow> bufferLength = length;
803 bufferLength = bufferLength * 3 + 1; // The buffer should be large enough to %-escape every character.
804 if (bufferLength.hasOverflowed())
805 return { };
806 Vector<char, urlBytesBufferLength> after(bufferLength.unsafeGet());
807
808 char* q = after.data();
809 {
810 const unsigned char* p = before;
811 for (int i = 0; i < length; i++) {
812 unsigned char c = p[i];
813 // unescape escape sequences that indicate bytes greater than 0x7f
814 if (c == '%' && i + 2 < length && isASCIIHexDigit(p[i + 1]) && isASCIIHexDigit(p[i + 2])) {
815 auto u = toASCIIHexValue(p[i + 1], p[i + 2]);
816 if (u > 0x7f) {
817 // unescape
818 *q++ = u;
819 } else {
820 // do not unescape
821 *q++ = p[i];
822 *q++ = p[i + 1];
823 *q++ = p[i + 2];
824 }
825 i += 2;
826 } else {
827 *q++ = c;
828
829 // Check for "xn--" in an efficient, non-case-sensitive, way.
830 if (c == '-' && i >= 3 && !mayNeedHostNameDecoding && (q[-4] | 0x20) == 'x' && (q[-3] | 0x20) == 'n' && q[-2] == '-')
831 mayNeedHostNameDecoding = true;
832 }
833 }
834 *q = '\0';
835 }
836
837 // Check string to see if it can be converted to display using UTF-8
838 String result = String::fromUTF8(after.data());
839 if (!result) {
840 // Could not convert to UTF-8.
841 // Convert characters greater than 0x7f to escape sequences.
842 // Shift current string to the end of the buffer
843 // then we will copy back bytes to the start of the buffer
844 // as we convert.
845 int afterlength = q - after.data();
846 char* p = after.data() + bufferLength.unsafeGet() - afterlength - 1;
847 memmove(p, after.data(), afterlength + 1); // copies trailing '\0'
848 char* q = after.data();
849 while (*p) {
850 unsigned char c = *p;
851 if (c > 0x7f) {
852 *q++ = '%';
853 *q++ = upperNibbleToASCIIHexDigit(c);
854 *q++ = lowerNibbleToASCIIHexDigit(c);
855 } else
856 *q++ = *p;
857 p++;
858 }
859 *q = '\0';
860 // Note: after.data() points to a null-terminated, pure ASCII string.
861 result = String::fromUTF8(after.data());
862 ASSERT(!!result);
863 }
864
865 // Note: result is UTF–16 string, created from either a valid UTF-8 string,
866 // or a pure ASCII string (where all bytes with the high bit set are
867 // percent-encoded).
868
869 if (mayNeedHostNameDecoding) {
870 // FIXME: Is it good to ignore the failure of mapHostNames and keep result intact?
871 auto mappedResult = mapHostNames(result, nullopt);
872 if (!!mappedResult)
873 result = mappedResult;
874 }
875
876 return escapeUnsafeCharacters(normalizedNFC(result));
877}
878
879} // namespace URLHelpers
880} // namespace WTF
881