1 | /* |
2 | * Copyright (C) 2005-2019 Apple Inc. All rights reserved. |
3 | * Copyright (C) 2018 Igalia S.L. |
4 | * |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions |
7 | * are met: |
8 | * |
9 | * 1. Redistributions of source code must retain the above copyright |
10 | * notice, this list of conditions and the following disclaimer. |
11 | * 2. Redistributions in binary form must reproduce the above copyright |
12 | * notice, this list of conditions and the following disclaimer in the |
13 | * documentation and/or other materials provided with the distribution. |
14 | * 3. Neither the name of Apple Inc. ("Apple") nor the names of |
15 | * its contributors may be used to endorse or promote products derived |
16 | * from this software without specific prior written permission. |
17 | * |
18 | * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY |
19 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
21 | * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
22 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
23 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
24 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
25 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
26 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
27 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | */ |
29 | |
30 | #include "config.h" |
31 | #include "URLHelpers.h" |
32 | |
33 | #include "URLParser.h" |
34 | #include <mutex> |
35 | #include <unicode/uidna.h> |
36 | #include <unicode/uscript.h> |
37 | #include <wtf/Optional.h> |
38 | #include <wtf/text/WTFString.h> |
39 | |
40 | namespace WTF { |
41 | namespace URLHelpers { |
42 | |
// Needs to be big enough to hold an IDN-encoded name.
// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
const unsigned hostNameBufferLength = 2048;
// Used as the size template argument of the output Vector in escapeUnsafeCharacters().
const unsigned urlBytesBufferLength = 2048;

// Bit set with one bit per ICU script code: bit (script % 32) of word (script / 32)
// is set once that script has been passed to whiteListIDNScript().
static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + 31) / 32];
49 | |
50 | #if !PLATFORM(COCOA) |
51 | |
52 | // Cocoa has an implementation that uses a whitelist in /Library or ~/Library, |
53 | // if it exists. |
54 | void loadIDNScriptWhiteList() |
55 | { |
56 | static std::once_flag flag; |
57 | std::call_once(flag, initializeDefaultIDNScriptWhiteList); |
58 | } |
59 | |
60 | #endif // !PLATFORM(COCOA) |
61 | |
62 | static bool isArmenianLookalikeCharacter(UChar32 codePoint) |
63 | { |
64 | return codePoint == 0x0548 || codePoint == 0x054D || codePoint == 0x0578 || codePoint == 0x057D; |
65 | } |
66 | |
67 | static bool isArmenianScriptCharacter(UChar32 codePoint) |
68 | { |
69 | UErrorCode error = U_ZERO_ERROR; |
70 | UScriptCode script = uscript_getScript(codePoint, &error); |
71 | if (error != U_ZERO_ERROR) { |
72 | LOG_ERROR("got ICU error while trying to look at scripts: %d" , error); |
73 | return false; |
74 | } |
75 | |
76 | return script == USCRIPT_ARMENIAN; |
77 | } |
78 | |
// Returns true when |charCode| is an ASCII digit or punctuation character that is
// not one of the characters the URL parser rejects: # % / : ? @ [ \ ]
template<typename CharacterType> inline bool isASCIIDigitOrValidHostCharacter(CharacterType charCode)
{
    if (!isASCIIDigitOrPunctuation(charCode))
        return false;

    // Things the URL Parser rejects:
    const bool rejectedByURLParser = charCode == '#'
        || charCode == '%'
        || charCode == '/'
        || charCode == ':'
        || charCode == '?'
        || charCode == '@'
        || charCode == '['
        || charCode == '\\'
        || charCode == ']';
    return !rejectedByURLParser;
}
100 | |
101 | static bool isLookalikeCharacter(const Optional<UChar32>& previousCodePoint, UChar32 charCode) |
102 | { |
103 | // This function treats the following as unsafe, lookalike characters: |
104 | // any non-printable character, any character considered as whitespace, |
105 | // any ignorable character, and emoji characters related to locks. |
106 | |
107 | // We also considered the characters in Mozilla's blacklist <http://kb.mozillazine.org/Network.IDN.blacklist_chars>. |
108 | |
109 | // Some of the characters here will never appear once ICU has encoded. |
110 | // For example, ICU transforms most spaces into an ASCII space and most |
111 | // slashes into an ASCII solidus. But one of the two callers uses this |
112 | // on characters that have not been processed by ICU, so they are needed here. |
113 | |
114 | if (!u_isprint(charCode) || u_isUWhiteSpace(charCode) || u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) |
115 | return true; |
116 | |
117 | switch (charCode) { |
118 | case 0x00BC: /* VULGAR FRACTION ONE QUARTER */ |
119 | case 0x00BD: /* VULGAR FRACTION ONE HALF */ |
120 | case 0x00BE: /* VULGAR FRACTION THREE QUARTERS */ |
121 | case 0x00ED: /* LATIN SMALL LETTER I WITH ACUTE */ |
122 | /* 0x0131 LATIN SMALL LETTER DOTLESS I is intentionally not considered a lookalike character because it is visually distinguishable from i and it has legitimate use in the Turkish language. */ |
123 | case 0x01C0: /* LATIN LETTER DENTAL CLICK */ |
124 | case 0x01C3: /* LATIN LETTER RETROFLEX CLICK */ |
125 | case 0x0251: /* LATIN SMALL LETTER ALPHA */ |
126 | case 0x0261: /* LATIN SMALL LETTER SCRIPT G */ |
127 | case 0x027E: /* LATIN SMALL LETTER R WITH FISHHOOK */ |
128 | case 0x02D0: /* MODIFIER LETTER TRIANGULAR COLON */ |
129 | case 0x0335: /* COMBINING SHORT STROKE OVERLAY */ |
130 | case 0x0337: /* COMBINING SHORT SOLIDUS OVERLAY */ |
131 | case 0x0338: /* COMBINING LONG SOLIDUS OVERLAY */ |
132 | case 0x0589: /* ARMENIAN FULL STOP */ |
133 | case 0x05B4: /* HEBREW POINT HIRIQ */ |
134 | case 0x05BC: /* HEBREW POINT DAGESH OR MAPIQ */ |
135 | case 0x05C3: /* HEBREW PUNCTUATION SOF PASUQ */ |
136 | case 0x05F4: /* HEBREW PUNCTUATION GERSHAYIM */ |
137 | case 0x0609: /* ARABIC-INDIC PER MILLE SIGN */ |
138 | case 0x060A: /* ARABIC-INDIC PER TEN THOUSAND SIGN */ |
139 | case 0x0650: /* ARABIC KASRA */ |
140 | case 0x0660: /* ARABIC INDIC DIGIT ZERO */ |
141 | case 0x066A: /* ARABIC PERCENT SIGN */ |
142 | case 0x06D4: /* ARABIC FULL STOP */ |
143 | case 0x06F0: /* EXTENDED ARABIC INDIC DIGIT ZERO */ |
144 | case 0x0701: /* SYRIAC SUPRALINEAR FULL STOP */ |
145 | case 0x0702: /* SYRIAC SUBLINEAR FULL STOP */ |
146 | case 0x0703: /* SYRIAC SUPRALINEAR COLON */ |
147 | case 0x0704: /* SYRIAC SUBLINEAR COLON */ |
148 | case 0x1735: /* PHILIPPINE SINGLE PUNCTUATION */ |
149 | case 0x1D04: /* LATIN LETTER SMALL CAPITAL C */ |
150 | case 0x1D0F: /* LATIN LETTER SMALL CAPITAL O */ |
151 | case 0x1D1C: /* LATIN LETTER SMALL CAPITAL U */ |
152 | case 0x1D20: /* LATIN LETTER SMALL CAPITAL V */ |
153 | case 0x1D21: /* LATIN LETTER SMALL CAPITAL W */ |
154 | case 0x1D22: /* LATIN LETTER SMALL CAPITAL Z */ |
155 | case 0x1ECD: /* LATIN SMALL LETTER O WITH DOT BELOW */ |
156 | case 0x2010: /* HYPHEN */ |
157 | case 0x2011: /* NON-BREAKING HYPHEN */ |
158 | case 0x2024: /* ONE DOT LEADER */ |
159 | case 0x2027: /* HYPHENATION POINT */ |
160 | case 0x2039: /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */ |
161 | case 0x203A: /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */ |
162 | case 0x2041: /* CARET INSERTION POINT */ |
163 | case 0x2044: /* FRACTION SLASH */ |
164 | case 0x2052: /* COMMERCIAL MINUS SIGN */ |
165 | case 0x2153: /* VULGAR FRACTION ONE THIRD */ |
166 | case 0x2154: /* VULGAR FRACTION TWO THIRDS */ |
167 | case 0x2155: /* VULGAR FRACTION ONE FIFTH */ |
168 | case 0x2156: /* VULGAR FRACTION TWO FIFTHS */ |
169 | case 0x2157: /* VULGAR FRACTION THREE FIFTHS */ |
170 | case 0x2158: /* VULGAR FRACTION FOUR FIFTHS */ |
171 | case 0x2159: /* VULGAR FRACTION ONE SIXTH */ |
172 | case 0x215A: /* VULGAR FRACTION FIVE SIXTHS */ |
173 | case 0x215B: /* VULGAR FRACTION ONE EIGHT */ |
174 | case 0x215C: /* VULGAR FRACTION THREE EIGHTHS */ |
175 | case 0x215D: /* VULGAR FRACTION FIVE EIGHTHS */ |
176 | case 0x215E: /* VULGAR FRACTION SEVEN EIGHTHS */ |
177 | case 0x215F: /* FRACTION NUMERATOR ONE */ |
178 | case 0x2212: /* MINUS SIGN */ |
179 | case 0x2215: /* DIVISION SLASH */ |
180 | case 0x2216: /* SET MINUS */ |
181 | case 0x2236: /* RATIO */ |
182 | case 0x233F: /* APL FUNCTIONAL SYMBOL SLASH BAR */ |
183 | case 0x23AE: /* INTEGRAL EXTENSION */ |
184 | case 0x244A: /* OCR DOUBLE BACKSLASH */ |
185 | case 0x2571: /* BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT */ |
186 | case 0x2572: /* BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT */ |
187 | case 0x29F6: /* SOLIDUS WITH OVERBAR */ |
188 | case 0x29F8: /* BIG SOLIDUS */ |
189 | case 0x2AFB: /* TRIPLE SOLIDUS BINARY RELATION */ |
190 | case 0x2AFD: /* DOUBLE SOLIDUS OPERATOR */ |
191 | case 0x2FF0: /* IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT */ |
192 | case 0x2FF1: /* IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW */ |
193 | case 0x2FF2: /* IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT */ |
194 | case 0x2FF3: /* IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW */ |
195 | case 0x2FF4: /* IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND */ |
196 | case 0x2FF5: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE */ |
197 | case 0x2FF6: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM BELOW */ |
198 | case 0x2FF7: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LEFT */ |
199 | case 0x2FF8: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER LEFT */ |
200 | case 0x2FF9: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT */ |
201 | case 0x2FFA: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT */ |
202 | case 0x2FFB: /* IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID */ |
203 | case 0x3002: /* IDEOGRAPHIC FULL STOP */ |
204 | case 0x3008: /* LEFT ANGLE BRACKET */ |
205 | case 0x3014: /* LEFT TORTOISE SHELL BRACKET */ |
206 | case 0x3015: /* RIGHT TORTOISE SHELL BRACKET */ |
207 | case 0x3033: /* VERTICAL KANA REPEAT MARK UPPER HALF */ |
208 | case 0x3035: /* VERTICAL KANA REPEAT MARK LOWER HALF */ |
209 | case 0x321D: /* PARENTHESIZED KOREAN CHARACTER OJEON */ |
210 | case 0x321E: /* PARENTHESIZED KOREAN CHARACTER O HU */ |
211 | case 0x33AE: /* SQUARE RAD OVER S */ |
212 | case 0x33AF: /* SQUARE RAD OVER S SQUARED */ |
213 | case 0x33C6: /* SQUARE C OVER KG */ |
214 | case 0x33DF: /* SQUARE A OVER M */ |
215 | case 0x05B9: /* HEBREW POINT HOLAM */ |
216 | case 0x05BA: /* HEBREW POINT HOLAM HASER FOR VAV */ |
217 | case 0x05C1: /* HEBREW POINT SHIN DOT */ |
218 | case 0x05C2: /* HEBREW POINT SIN DOT */ |
219 | case 0x05C4: /* HEBREW MARK UPPER DOT */ |
220 | case 0xA731: /* LATIN LETTER SMALL CAPITAL S */ |
221 | case 0xA771: /* LATIN SMALL LETTER DUM */ |
222 | case 0xA789: /* MODIFIER LETTER COLON */ |
223 | case 0xFE14: /* PRESENTATION FORM FOR VERTICAL SEMICOLON */ |
224 | case 0xFE15: /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */ |
225 | case 0xFE3F: /* PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET */ |
226 | case 0xFE5D: /* SMALL LEFT TORTOISE SHELL BRACKET */ |
227 | case 0xFE5E: /* SMALL RIGHT TORTOISE SHELL BRACKET */ |
228 | case 0xFF0E: /* FULLWIDTH FULL STOP */ |
229 | case 0xFF0F: /* FULL WIDTH SOLIDUS */ |
230 | case 0xFF61: /* HALFWIDTH IDEOGRAPHIC FULL STOP */ |
231 | case 0xFFFC: /* OBJECT REPLACEMENT CHARACTER */ |
232 | case 0xFFFD: /* REPLACEMENT CHARACTER */ |
233 | case 0x1F50F: /* LOCK WITH INK PEN */ |
234 | case 0x1F510: /* CLOSED LOCK WITH KEY */ |
235 | case 0x1F511: /* KEY */ |
236 | case 0x1F512: /* LOCK */ |
237 | case 0x1F513: /* OPEN LOCK */ |
238 | return true; |
239 | case 0x0307: /* COMBINING DOT ABOVE */ |
240 | return previousCodePoint == 0x0237 /* LATIN SMALL LETTER DOTLESS J */ |
241 | || previousCodePoint == 0x0131 /* LATIN SMALL LETTER DOTLESS I */ |
242 | || previousCodePoint == 0x05D5; /* HEBREW LETTER VAV */ |
243 | case 0x0548: /* ARMENIAN CAPITAL LETTER VO */ |
244 | case 0x054D: /* ARMENIAN CAPITAL LETTER SEH */ |
245 | case 0x0578: /* ARMENIAN SMALL LETTER VO */ |
246 | case 0x057D: /* ARMENIAN SMALL LETTER SEH */ |
247 | return previousCodePoint |
248 | && !isASCIIDigitOrValidHostCharacter(previousCodePoint.value()) |
249 | && !isArmenianScriptCharacter(previousCodePoint.value()); |
250 | case '.': |
251 | return false; |
252 | default: |
253 | return previousCodePoint |
254 | && isArmenianLookalikeCharacter(previousCodePoint.value()) |
255 | && !(isArmenianScriptCharacter(charCode) || isASCIIDigitOrValidHostCharacter(charCode)); |
256 | } |
257 | } |
258 | |
259 | void whiteListIDNScript(const char* scriptName) |
260 | { |
261 | int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName); |
262 | if (script >= 0 && script < USCRIPT_CODE_LIMIT) { |
263 | size_t index = script / 32; |
264 | uint32_t mask = 1 << (script % 32); |
265 | IDNScriptWhiteList[index] |= mask; |
266 | } |
267 | } |
268 | |
269 | void initializeDefaultIDNScriptWhiteList() |
270 | { |
271 | const char* defaultIDNScriptWhiteList[20] = { |
272 | "Common" , |
273 | "Inherited" , |
274 | "Arabic" , |
275 | "Armenian" , |
276 | "Bopomofo" , |
277 | "Canadian_Aboriginal" , |
278 | "Devanagari" , |
279 | "Deseret" , |
280 | "Gujarati" , |
281 | "Gurmukhi" , |
282 | "Hangul" , |
283 | "Han" , |
284 | "Hebrew" , |
285 | "Hiragana" , |
286 | "Katakana_Or_Hiragana" , |
287 | "Katakana" , |
288 | "Latin" , |
289 | "Tamil" , |
290 | "Thai" , |
291 | "Yi" , |
292 | }; |
293 | for (const char* scriptName : defaultIDNScriptWhiteList) |
294 | whiteListIDNScript(scriptName); |
295 | } |
296 | |
297 | static bool allCharactersInIDNScriptWhiteList(const UChar* buffer, int32_t length) |
298 | { |
299 | loadIDNScriptWhiteList(); |
300 | int32_t i = 0; |
301 | Optional<UChar32> previousCodePoint; |
302 | while (i < length) { |
303 | UChar32 c; |
304 | U16_NEXT(buffer, i, length, c) |
305 | UErrorCode error = U_ZERO_ERROR; |
306 | UScriptCode script = uscript_getScript(c, &error); |
307 | if (error != U_ZERO_ERROR) { |
308 | LOG_ERROR("got ICU error while trying to look at scripts: %d" , error); |
309 | return false; |
310 | } |
311 | if (script < 0) { |
312 | LOG_ERROR("got negative number for script code from ICU: %d" , script); |
313 | return false; |
314 | } |
315 | if (script >= USCRIPT_CODE_LIMIT) |
316 | return false; |
317 | |
318 | size_t index = script / 32; |
319 | uint32_t mask = 1 << (script % 32); |
320 | if (!(IDNScriptWhiteList[index] & mask)) |
321 | return false; |
322 | |
323 | if (isLookalikeCharacter(previousCodePoint, c)) |
324 | return false; |
325 | previousCodePoint = c; |
326 | } |
327 | return true; |
328 | } |
329 | |
330 | static bool isSecondLevelDomainNameAllowedByTLDRules(const UChar* buffer, int32_t length, const WTF::Function<bool(UChar)>& characterIsAllowed) |
331 | { |
332 | ASSERT(length > 0); |
333 | |
334 | for (int32_t i = length - 1; i >= 0; --i) { |
335 | UChar ch = buffer[i]; |
336 | |
337 | if (characterIsAllowed(ch)) |
338 | continue; |
339 | |
340 | // Only check the second level domain. Lower level registrars may have different rules. |
341 | if (ch == '.') |
342 | break; |
343 | |
344 | return false; |
345 | } |
346 | return true; |
347 | } |
348 | |
// Must be used inside a function that has `buffer` (UTF-16 code units) and `length`
// in scope. If the name is longer than |suffix| and ends with it, returns from the
// enclosing function with the verdict of isSecondLevelDomainNameAllowedByTLDRules()
// applied to the part before the suffix, using |function| as the per-character rule.
// Otherwise it does nothing. |suffix| must be an array, since its size is taken with sizeof.
// Wrapped in do { } while (0) so that `MACRO(...);` is exactly one statement and
// stays correct inside an unbraced if/else.
#define CHECK_RULES_IF_SUFFIX_MATCHES(suffix, function) \
    do { \
        static const int32_t suffixLength = sizeof(suffix) / sizeof(suffix[0]); \
        if (length > suffixLength && !memcmp(buffer + length - suffixLength, suffix, sizeof(suffix))) \
            return isSecondLevelDomainNameAllowedByTLDRules(buffer, length - suffixLength, function); \
    } while (0)
355 | |
356 | static bool isRussianDomainNameCharacter(UChar ch) |
357 | { |
358 | // Only modern Russian letters, digits and dashes are allowed. |
359 | return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || isASCIIDigit(ch) || ch == '-'; |
360 | } |
361 | |
362 | static bool allCharactersAllowedByTLDRules(const UChar* buffer, int32_t length) |
363 | { |
364 | // Skip trailing dot for root domain. |
365 | if (buffer[length - 1] == '.') |
366 | length--; |
367 | |
368 | // http://cctld.ru/files/pdf/docs/rules_ru-rf.pdf |
369 | static const UChar cyrillicRF[] = { |
370 | '.', |
371 | 0x0440, // CYRILLIC SMALL LETTER ER |
372 | 0x0444, // CYRILLIC SMALL LETTER EF |
373 | }; |
374 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRF, isRussianDomainNameCharacter); |
375 | |
376 | // http://rusnames.ru/rules.pl |
377 | static const UChar cyrillicRUS[] = { |
378 | '.', |
379 | 0x0440, // CYRILLIC SMALL LETTER ER |
380 | 0x0443, // CYRILLIC SMALL LETTER U |
381 | 0x0441, // CYRILLIC SMALL LETTER ES |
382 | }; |
383 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRUS, isRussianDomainNameCharacter); |
384 | |
385 | // http://ru.faitid.org/projects/moscow/documents/moskva/idn |
386 | static const UChar cyrillicMOSKVA[] = { |
387 | '.', |
388 | 0x043C, // CYRILLIC SMALL LETTER EM |
389 | 0x043E, // CYRILLIC SMALL LETTER O |
390 | 0x0441, // CYRILLIC SMALL LETTER ES |
391 | 0x043A, // CYRILLIC SMALL LETTER KA |
392 | 0x0432, // CYRILLIC SMALL LETTER VE |
393 | 0x0430, // CYRILLIC SMALL LETTER A |
394 | }; |
395 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMOSKVA, isRussianDomainNameCharacter); |
396 | |
397 | // http://www.dotdeti.ru/foruser/docs/regrules.php |
398 | static const UChar cyrillicDETI[] = { |
399 | '.', |
400 | 0x0434, // CYRILLIC SMALL LETTER DE |
401 | 0x0435, // CYRILLIC SMALL LETTER IE |
402 | 0x0442, // CYRILLIC SMALL LETTER TE |
403 | 0x0438, // CYRILLIC SMALL LETTER I |
404 | }; |
405 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicDETI, isRussianDomainNameCharacter); |
406 | |
407 | // http://corenic.org - rules not published. The word is Russian, so only allowing Russian at this time, |
408 | // although we may need to revise the checks if this ends up being used with other languages spoken in Russia. |
409 | static const UChar cyrillicONLAYN[] = { |
410 | '.', |
411 | 0x043E, // CYRILLIC SMALL LETTER O |
412 | 0x043D, // CYRILLIC SMALL LETTER EN |
413 | 0x043B, // CYRILLIC SMALL LETTER EL |
414 | 0x0430, // CYRILLIC SMALL LETTER A |
415 | 0x0439, // CYRILLIC SMALL LETTER SHORT I |
416 | 0x043D, // CYRILLIC SMALL LETTER EN |
417 | }; |
418 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicONLAYN, isRussianDomainNameCharacter); |
419 | |
420 | // http://corenic.org - same as above. |
421 | static const UChar cyrillicSAYT[] = { |
422 | '.', |
423 | 0x0441, // CYRILLIC SMALL LETTER ES |
424 | 0x0430, // CYRILLIC SMALL LETTER A |
425 | 0x0439, // CYRILLIC SMALL LETTER SHORT I |
426 | 0x0442, // CYRILLIC SMALL LETTER TE |
427 | }; |
428 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSAYT, isRussianDomainNameCharacter); |
429 | |
430 | // http://pir.org/products/opr-domain/ - rules not published. According to the registry site, |
431 | // the intended audience is "Russian and other Slavic-speaking markets". |
432 | // Chrome appears to only allow Russian, so sticking with that for now. |
433 | static const UChar cyrillicORG[] = { |
434 | '.', |
435 | 0x043E, // CYRILLIC SMALL LETTER O |
436 | 0x0440, // CYRILLIC SMALL LETTER ER |
437 | 0x0433, // CYRILLIC SMALL LETTER GHE |
438 | }; |
439 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicORG, isRussianDomainNameCharacter); |
440 | |
441 | // http://cctld.by/rules.html |
442 | static const UChar cyrillicBEL[] = { |
443 | '.', |
444 | 0x0431, // CYRILLIC SMALL LETTER BE |
445 | 0x0435, // CYRILLIC SMALL LETTER IE |
446 | 0x043B, // CYRILLIC SMALL LETTER EL |
447 | }; |
448 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBEL, [](UChar ch) { |
449 | // Russian and Byelorussian letters, digits and dashes are allowed. |
450 | return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x0456 || ch == 0x045E || ch == 0x2019 || isASCIIDigit(ch) || ch == '-'; |
451 | }); |
452 | |
453 | // http://www.nic.kz/docs/poryadok_vnedreniya_kaz_ru.pdf |
454 | static const UChar cyrillicKAZ[] = { |
455 | '.', |
456 | 0x049B, // CYRILLIC SMALL LETTER KA WITH DESCENDER |
457 | 0x0430, // CYRILLIC SMALL LETTER A |
458 | 0x0437, // CYRILLIC SMALL LETTER ZE |
459 | }; |
460 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicKAZ, [](UChar ch) { |
461 | // Kazakh letters, digits and dashes are allowed. |
462 | return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x04D9 || ch == 0x0493 || ch == 0x049B || ch == 0x04A3 || ch == 0x04E9 || ch == 0x04B1 || ch == 0x04AF || ch == 0x04BB || ch == 0x0456 || isASCIIDigit(ch) || ch == '-'; |
463 | }); |
464 | |
465 | // http://uanic.net/docs/documents-ukr/Rules%20of%20UKR_v4.0.pdf |
466 | static const UChar cyrillicUKR[] = { |
467 | '.', |
468 | 0x0443, // CYRILLIC SMALL LETTER U |
469 | 0x043A, // CYRILLIC SMALL LETTER KA |
470 | 0x0440, // CYRILLIC SMALL LETTER ER |
471 | }; |
472 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicUKR, [](UChar ch) { |
473 | // Russian and Ukrainian letters, digits and dashes are allowed. |
474 | return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x0491 || ch == 0x0404 || ch == 0x0456 || ch == 0x0457 || isASCIIDigit(ch) || ch == '-'; |
475 | }); |
476 | |
477 | // http://www.rnids.rs/data/DOKUMENTI/idn-srb-policy-termsofuse-v1.4-eng.pdf |
478 | static const UChar cyrillicSRB[] = { |
479 | '.', |
480 | 0x0441, // CYRILLIC SMALL LETTER ES |
481 | 0x0440, // CYRILLIC SMALL LETTER ER |
482 | 0x0431, // CYRILLIC SMALL LETTER BE |
483 | }; |
484 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSRB, [](UChar ch) { |
485 | // Serbian letters, digits and dashes are allowed. |
486 | return (ch >= 0x0430 && ch <= 0x0438) || (ch >= 0x043A && ch <= 0x0448) || ch == 0x0452 || ch == 0x0458 || ch == 0x0459 || ch == 0x045A || ch == 0x045B || ch == 0x045F || isASCIIDigit(ch) || ch == '-'; |
487 | }); |
488 | |
489 | // http://marnet.mk/doc/pravilnik-mk-mkd.pdf |
490 | static const UChar cyrillicMKD[] = { |
491 | '.', |
492 | 0x043C, // CYRILLIC SMALL LETTER EM |
493 | 0x043A, // CYRILLIC SMALL LETTER KA |
494 | 0x0434, // CYRILLIC SMALL LETTER DE |
495 | }; |
496 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMKD, [](UChar ch) { |
497 | // Macedonian letters, digits and dashes are allowed. |
498 | return (ch >= 0x0430 && ch <= 0x0438) || (ch >= 0x043A && ch <= 0x0448) || ch == 0x0453 || ch == 0x0455 || ch == 0x0458 || ch == 0x0459 || ch == 0x045A || ch == 0x045C || ch == 0x045F || isASCIIDigit(ch) || ch == '-'; |
499 | }); |
500 | |
501 | // https://www.mon.mn/cs/ |
502 | static const UChar cyrillicMON[] = { |
503 | '.', |
504 | 0x043C, // CYRILLIC SMALL LETTER EM |
505 | 0x043E, // CYRILLIC SMALL LETTER O |
506 | 0x043D, // CYRILLIC SMALL LETTER EN |
507 | }; |
508 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMON, [](UChar ch) { |
509 | // Mongolian letters, digits and dashes are allowed. |
510 | return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x04E9 || ch == 0x04AF || isASCIIDigit(ch) || ch == '-'; |
511 | }); |
512 | |
513 | // https://www.icann.org/sites/default/files/packages/lgr/lgr-second-level-bulgarian-30aug16-en.html |
514 | static const UChar cyrillicBG[] = { |
515 | '.', |
516 | 0x0431, // CYRILLIC SMALL LETTER BE |
517 | 0x0433 // CYRILLIC SMALL LETTER GHE |
518 | }; |
519 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBG, [](UChar ch) { |
520 | return (ch >= 0x0430 && ch <= 0x044A) || ch == 0x044C || (ch >= 0x044E && ch <= 0x0450) || ch == 0x045D || isASCIIDigit(ch) || ch == '-'; |
521 | }); |
522 | |
523 | // Not a known top level domain with special rules. |
524 | return false; |
525 | } |
526 | |
527 | // Return value of null means no mapping is necessary. |
528 | Optional<String> mapHostName(const String& hostName, const Optional<URLDecodeFunction>& decodeFunction) |
529 | { |
530 | if (hostName.length() > hostNameBufferLength) |
531 | return String(); |
532 | |
533 | if (!hostName.length()) |
534 | return String(); |
535 | |
536 | String string; |
537 | if (decodeFunction && string.contains('%')) |
538 | string = (*decodeFunction)(hostName); |
539 | else |
540 | string = hostName; |
541 | |
542 | unsigned length = string.length(); |
543 | |
544 | auto sourceBuffer = string.charactersWithNullTermination(); |
545 | |
546 | UChar destinationBuffer[hostNameBufferLength]; |
547 | UErrorCode uerror = U_ZERO_ERROR; |
548 | UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER; |
549 | int32_t numCharactersConverted = (decodeFunction ? uidna_nameToASCII : uidna_nameToUnicode)(&URLParser::internationalDomainNameTranscoder(), sourceBuffer.data(), length, destinationBuffer, hostNameBufferLength, &processingDetails, &uerror); |
550 | if (length && (U_FAILURE(uerror) || processingDetails.errors)) |
551 | return nullopt; |
552 | |
553 | if (numCharactersConverted == static_cast<int32_t>(length) && !memcmp(sourceBuffer.data(), destinationBuffer, length * sizeof(UChar))) |
554 | return String(); |
555 | |
556 | if (!decodeFunction && !allCharactersInIDNScriptWhiteList(destinationBuffer, numCharactersConverted) && !allCharactersAllowedByTLDRules(destinationBuffer, numCharactersConverted)) |
557 | return String(); |
558 | |
559 | return String(destinationBuffer, numCharactersConverted); |
560 | } |
561 | |
// Host name ranges collected from a URL string, each stored as (location, length, mapped host name).
// As used by collectRangesThatNeedMapping(): nullopt means nothing needed mapping,
// while an engaged but empty vector indicates an error.
using MappingRangesVector = Optional<Vector<std::tuple<unsigned, unsigned, String>>>;
563 | |
564 | static void collectRangesThatNeedMapping(const String& string, unsigned location, unsigned length, MappingRangesVector& array, const Optional<URLDecodeFunction>& decodeFunction) |
565 | { |
566 | // Generally, we want to optimize for the case where there is one host name that does not need mapping. |
567 | // Therefore, we use null to indicate no mapping here and an empty array to indicate error. |
568 | |
569 | String substring = string.substringSharingImpl(location, length); |
570 | Optional<String> host = mapHostName(substring, decodeFunction); |
571 | |
572 | if (host && !*host) |
573 | return; |
574 | |
575 | if (!array) |
576 | array = Vector<std::tuple<unsigned, unsigned, String>>(); |
577 | |
578 | if (host) |
579 | array->constructAndAppend(location, length, *host); |
580 | } |
581 | |
582 | static void applyHostNameFunctionToMailToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array) |
583 | { |
584 | // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' character. |
585 | // Skip quoted strings so that characters in them don't confuse us. |
586 | // When we find a '?' character, we are past the part of the URL that contains host names. |
587 | |
588 | unsigned stringLength = string.length(); |
589 | unsigned current = 0; |
590 | |
591 | while (1) { |
592 | // Find start of host name or of quoted string. |
593 | auto hostNameOrStringStart = string.find([](UChar ch) { |
594 | return ch == '"' || ch == '@' || ch == '?'; |
595 | }, current); |
596 | if (hostNameOrStringStart == notFound) |
597 | return; |
598 | |
599 | UChar c = string[hostNameOrStringStart]; |
600 | current = hostNameOrStringStart + 1; |
601 | |
602 | if (c == '?') |
603 | return; |
604 | |
605 | if (c == '@') { |
606 | // Find end of host name. |
607 | unsigned hostNameStart = current; |
608 | auto hostNameEnd = string.find([](UChar ch) { |
609 | return ch == '>' || ch == ',' || ch == '?'; |
610 | }, current); |
611 | |
612 | bool done; |
613 | if (hostNameEnd == notFound) { |
614 | hostNameEnd = stringLength; |
615 | done = true; |
616 | } else { |
617 | current = hostNameEnd; |
618 | done = false; |
619 | } |
620 | |
621 | // Process host name range. |
622 | collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction); |
623 | |
624 | if (done) |
625 | return; |
626 | } else { |
627 | // Skip quoted string. |
628 | ASSERT(c == '"'); |
629 | while (1) { |
630 | auto escapedCharacterOrStringEnd = string.find([](UChar ch) { |
631 | return ch == '"' || ch == '\\'; |
632 | }, current); |
633 | if (escapedCharacterOrStringEnd == notFound) |
634 | return; |
635 | |
636 | c = string[escapedCharacterOrStringEnd]; |
637 | current = escapedCharacterOrStringEnd + 1; |
638 | |
639 | // If we are the end of the string, then break from the string loop back to the host name loop. |
640 | if (c == '"') |
641 | break; |
642 | |
643 | // Skip escaped character. |
644 | ASSERT(c == '\\'); |
645 | if (current == stringLength) |
646 | return; |
647 | |
648 | ++current; |
649 | } |
650 | } |
651 | } |
652 | } |
653 | |
654 | static void applyHostNameFunctionToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array) |
655 | { |
656 | // Find hostnames. Too bad we can't use any real URL-parsing code to do this, |
657 | // but we have to do it before doing all the %-escaping, and this is the only |
658 | // code we have that parses mailto URLs anyway. |
659 | |
660 | // Maybe we should implement this using a character buffer instead? |
661 | |
662 | if (protocolIs(string, "mailto" )) { |
663 | applyHostNameFunctionToMailToURLString(string, decodeFunction, array); |
664 | return; |
665 | } |
666 | |
667 | // Find the host name in a hierarchical URL. |
668 | // It comes after a "://" sequence, with scheme characters preceding. |
669 | // If ends with the end of the string or a ":", "/", or a "?". |
670 | // If there is a "@" character, the host part is just the part after the "@". |
671 | static const char* separator = "://" ; |
672 | auto separatorIndex = string.find(separator); |
673 | if (separatorIndex == notFound) |
674 | return; |
675 | |
676 | unsigned authorityStart = separatorIndex + strlen(separator); |
677 | |
678 | // Check that all characters before the :// are valid scheme characters. |
679 | auto invalidSchemeCharacter = string.substringSharingImpl(0, separatorIndex).find([](UChar ch) { |
680 | static const char* allowedCharacters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-." ; |
681 | static size_t length = strlen(allowedCharacters); |
682 | for (size_t i = 0; i < length; ++i) { |
683 | if (allowedCharacters[i] == ch) |
684 | return false; |
685 | } |
686 | return true; |
687 | }); |
688 | |
689 | if (invalidSchemeCharacter != notFound) |
690 | return; |
691 | |
692 | unsigned stringLength = string.length(); |
693 | |
694 | // Find terminating character. |
695 | auto hostNameTerminator = string.find([](UChar ch) { |
696 | static const char* terminatingCharacters = ":/?#" ; |
697 | static size_t length = strlen(terminatingCharacters); |
698 | for (size_t i = 0; i < length; ++i) { |
699 | if (terminatingCharacters[i] == ch) |
700 | return true; |
701 | } |
702 | return false; |
703 | }, authorityStart); |
704 | unsigned hostNameEnd = hostNameTerminator == notFound ? stringLength : hostNameTerminator; |
705 | |
706 | // Find "@" for the start of the host name. |
707 | auto userInfoTerminator = string.substringSharingImpl(0, hostNameEnd).find('@', authorityStart); |
708 | unsigned hostNameStart = userInfoTerminator == notFound ? authorityStart : userInfoTerminator + 1; |
709 | |
710 | collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction); |
711 | } |
712 | |
713 | String mapHostNames(const String& string, const Optional<URLDecodeFunction>& decodeFunction) |
714 | { |
715 | // Generally, we want to optimize for the case where there is one host name that does not need mapping. |
716 | |
717 | if (decodeFunction && string.isAllASCII()) |
718 | return string; |
719 | |
720 | // Make a list of ranges that actually need mapping. |
721 | MappingRangesVector hostNameRanges; |
722 | applyHostNameFunctionToURLString(string, decodeFunction, hostNameRanges); |
723 | if (!hostNameRanges) |
724 | return string; |
725 | |
726 | if (hostNameRanges->isEmpty()) |
727 | return { }; |
728 | |
729 | // Do the mapping. |
730 | String result = string; |
731 | while (!hostNameRanges->isEmpty()) { |
732 | unsigned location, length; |
733 | String mappedHostName; |
734 | std::tie(location, length, mappedHostName) = hostNameRanges->takeLast(); |
735 | result = result.replace(location, length, mappedHostName); |
736 | } |
737 | return result; |
738 | } |
739 | |
740 | static String escapeUnsafeCharacters(const String& sourceBuffer) |
741 | { |
742 | unsigned length = sourceBuffer.length(); |
743 | |
744 | Optional<UChar32> previousCodePoint; |
745 | |
746 | unsigned i; |
747 | for (i = 0; i < length; ) { |
748 | UChar32 c = sourceBuffer.characterStartingAt(i); |
749 | if (isLookalikeCharacter(previousCodePoint, sourceBuffer.characterStartingAt(i))) |
750 | break; |
751 | previousCodePoint = c; |
752 | i += U16_LENGTH(c); |
753 | } |
754 | |
755 | if (i == length) |
756 | return sourceBuffer; |
757 | |
758 | Vector<UChar, urlBytesBufferLength> outBuffer; |
759 | |
760 | outBuffer.grow(i); |
761 | if (sourceBuffer.is8Bit()) |
762 | StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters8(), i); |
763 | else |
764 | StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters16(), i); |
765 | |
766 | for (; i < length; ) { |
767 | UChar32 c = sourceBuffer.characterStartingAt(i); |
768 | unsigned characterLength = U16_LENGTH(c); |
769 | if (isLookalikeCharacter(previousCodePoint, c)) { |
770 | uint8_t utf8Buffer[4]; |
771 | size_t offset = 0; |
772 | UBool failure = false; |
773 | U8_APPEND(utf8Buffer, offset, 4, c, failure); |
774 | ASSERT_UNUSED(failure, !failure); |
775 | |
776 | for (size_t j = 0; j < offset; ++j) { |
777 | outBuffer.append('%'); |
778 | outBuffer.append(upperNibbleToASCIIHexDigit(utf8Buffer[j])); |
779 | outBuffer.append(lowerNibbleToASCIIHexDigit(utf8Buffer[j])); |
780 | } |
781 | } else { |
782 | for (unsigned j = 0; j < characterLength; ++j) |
783 | outBuffer.append(sourceBuffer[i + j]); |
784 | } |
785 | previousCodePoint = c; |
786 | i += characterLength; |
787 | } |
788 | |
789 | return String::adopt(WTFMove(outBuffer)); |
790 | } |
791 | |
792 | String userVisibleURL(const CString& url) |
793 | { |
794 | auto* before = reinterpret_cast<const unsigned char*>(url.data()); |
795 | int length = url.length(); |
796 | |
797 | if (!length) |
798 | return { }; |
799 | |
800 | bool mayNeedHostNameDecoding = false; |
801 | |
802 | Checked<int, RecordOverflow> bufferLength = length; |
803 | bufferLength = bufferLength * 3 + 1; // The buffer should be large enough to %-escape every character. |
804 | if (bufferLength.hasOverflowed()) |
805 | return { }; |
806 | Vector<char, urlBytesBufferLength> after(bufferLength.unsafeGet()); |
807 | |
808 | char* q = after.data(); |
809 | { |
810 | const unsigned char* p = before; |
811 | for (int i = 0; i < length; i++) { |
812 | unsigned char c = p[i]; |
813 | // unescape escape sequences that indicate bytes greater than 0x7f |
814 | if (c == '%' && i + 2 < length && isASCIIHexDigit(p[i + 1]) && isASCIIHexDigit(p[i + 2])) { |
815 | auto u = toASCIIHexValue(p[i + 1], p[i + 2]); |
816 | if (u > 0x7f) { |
817 | // unescape |
818 | *q++ = u; |
819 | } else { |
820 | // do not unescape |
821 | *q++ = p[i]; |
822 | *q++ = p[i + 1]; |
823 | *q++ = p[i + 2]; |
824 | } |
825 | i += 2; |
826 | } else { |
827 | *q++ = c; |
828 | |
829 | // Check for "xn--" in an efficient, non-case-sensitive, way. |
830 | if (c == '-' && i >= 3 && !mayNeedHostNameDecoding && (q[-4] | 0x20) == 'x' && (q[-3] | 0x20) == 'n' && q[-2] == '-') |
831 | mayNeedHostNameDecoding = true; |
832 | } |
833 | } |
834 | *q = '\0'; |
835 | } |
836 | |
837 | // Check string to see if it can be converted to display using UTF-8 |
838 | String result = String::fromUTF8(after.data()); |
839 | if (!result) { |
840 | // Could not convert to UTF-8. |
841 | // Convert characters greater than 0x7f to escape sequences. |
842 | // Shift current string to the end of the buffer |
843 | // then we will copy back bytes to the start of the buffer |
844 | // as we convert. |
845 | int afterlength = q - after.data(); |
846 | char* p = after.data() + bufferLength.unsafeGet() - afterlength - 1; |
847 | memmove(p, after.data(), afterlength + 1); // copies trailing '\0' |
848 | char* q = after.data(); |
849 | while (*p) { |
850 | unsigned char c = *p; |
851 | if (c > 0x7f) { |
852 | *q++ = '%'; |
853 | *q++ = upperNibbleToASCIIHexDigit(c); |
854 | *q++ = lowerNibbleToASCIIHexDigit(c); |
855 | } else |
856 | *q++ = *p; |
857 | p++; |
858 | } |
859 | *q = '\0'; |
860 | // Note: after.data() points to a null-terminated, pure ASCII string. |
861 | result = String::fromUTF8(after.data()); |
862 | ASSERT(!!result); |
863 | } |
864 | |
865 | // Note: result is UTF–16 string, created from either a valid UTF-8 string, |
866 | // or a pure ASCII string (where all bytes with the high bit set are |
867 | // percent-encoded). |
868 | |
869 | if (mayNeedHostNameDecoding) { |
870 | // FIXME: Is it good to ignore the failure of mapHostNames and keep result intact? |
871 | auto mappedResult = mapHostNames(result, nullopt); |
872 | if (!!mappedResult) |
873 | result = mappedResult; |
874 | } |
875 | |
876 | return escapeUnsafeCharacters(normalizedNFC(result)); |
877 | } |
878 | |
879 | } // namespace URLHelpers |
880 | } // namespace WTF |
881 | |