1 | /* |
2 | * Copyright (C) 2005-2019 Apple Inc. All rights reserved. |
3 | * Copyright (C) 2018 Igalia S.L. |
4 | * |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions |
7 | * are met: |
8 | * |
9 | * 1. Redistributions of source code must retain the above copyright |
10 | * notice, this list of conditions and the following disclaimer. |
11 | * 2. Redistributions in binary form must reproduce the above copyright |
12 | * notice, this list of conditions and the following disclaimer in the |
13 | * documentation and/or other materials provided with the distribution. |
14 | * 3. Neither the name of Apple Inc. ("Apple") nor the names of |
15 | * its contributors may be used to endorse or promote products derived |
16 | * from this software without specific prior written permission. |
17 | * |
18 | * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY |
19 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
21 | * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
22 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
23 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
24 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
25 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
26 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
27 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | */ |
29 | |
30 | #include "config.h" |
31 | #include "URLHelpers.h" |
32 | |
33 | #include "URLParser.h" |
34 | #include <mutex> |
35 | #include <unicode/uidna.h> |
36 | #include <unicode/uscript.h> |
37 | #include <wtf/Optional.h> |
38 | #include <wtf/text/WTFString.h> |
39 | |
40 | namespace WTF { |
41 | namespace URLHelpers { |
42 | |
43 | // Needs to be big enough to hold an IDN-encoded name. |
44 | // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK. |
45 | const unsigned hostNameBufferLength = 2048; |
46 | const unsigned urlBytesBufferLength = 2048; |
47 | |
48 | static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + 31) / 32]; |
49 | |
50 | #if !PLATFORM(COCOA) |
51 | |
52 | // Cocoa has an implementation that uses a whitelist in /Library or ~/Library, |
53 | // if it exists. |
54 | void loadIDNScriptWhiteList() |
55 | { |
56 | static std::once_flag flag; |
57 | std::call_once(flag, initializeDefaultIDNScriptWhiteList); |
58 | } |
59 | |
60 | #endif // !PLATFORM(COCOA) |
61 | |
62 | static bool isArmenianLookalikeCharacter(UChar32 codePoint) |
63 | { |
64 | return codePoint == 0x0548 || codePoint == 0x054D || codePoint == 0x0578 || codePoint == 0x057D; |
65 | } |
66 | |
67 | static bool isArmenianScriptCharacter(UChar32 codePoint) |
68 | { |
69 | UErrorCode error = U_ZERO_ERROR; |
70 | UScriptCode script = uscript_getScript(codePoint, &error); |
71 | if (error != U_ZERO_ERROR) { |
72 | LOG_ERROR("got ICU error while trying to look at scripts: %d" , error); |
73 | return false; |
74 | } |
75 | |
76 | return script == USCRIPT_ARMENIAN; |
77 | } |
78 | |
// Returns true when charCode is an ASCII digit or punctuation character that is
// not on the list of characters the URL parser rejects in a host.
template<typename CharacterType> inline bool isASCIIDigitOrValidHostCharacter(CharacterType charCode)
{
    // Things the URL Parser rejects:
    switch (charCode) {
    case '#':
    case '%':
    case '/':
    case ':':
    case '?':
    case '@':
    case '[':
    case '\\':
    case ']':
        return false;
    default:
        break;
    }

    // Everything else is acceptable as long as it is an ASCII digit or punctuation.
    return isASCIIDigitOrPunctuation(charCode);
}
100 | |
101 | static bool isLookalikeCharacter(const Optional<UChar32>& previousCodePoint, UChar32 charCode) |
102 | { |
103 | // This function treats the following as unsafe, lookalike characters: |
104 | // any non-printable character, any character considered as whitespace, |
105 | // any ignorable character, and emoji characters related to locks. |
106 | |
107 | // We also considered the characters in Mozilla's blacklist <http://kb.mozillazine.org/Network.IDN.blacklist_chars>. |
108 | |
109 | // Some of the characters here will never appear once ICU has encoded. |
110 | // For example, ICU transforms most spaces into an ASCII space and most |
111 | // slashes into an ASCII solidus. But one of the two callers uses this |
112 | // on characters that have not been processed by ICU, so they are needed here. |
113 | |
114 | if (!u_isprint(charCode) || u_isUWhiteSpace(charCode) || u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) |
115 | return true; |
116 | |
117 | switch (charCode) { |
118 | case 0x00BC: /* VULGAR FRACTION ONE QUARTER */ |
119 | case 0x00BD: /* VULGAR FRACTION ONE HALF */ |
120 | case 0x00BE: /* VULGAR FRACTION THREE QUARTERS */ |
121 | case 0x00ED: /* LATIN SMALL LETTER I WITH ACUTE */ |
122 | /* 0x0131 LATIN SMALL LETTER DOTLESS I is intentionally not considered a lookalike character because it is visually distinguishable from i and it has legitimate use in the Turkish language. */ |
123 | case 0x01C0: /* LATIN LETTER DENTAL CLICK */ |
124 | case 0x01C3: /* LATIN LETTER RETROFLEX CLICK */ |
125 | case 0x0251: /* LATIN SMALL LETTER ALPHA */ |
126 | case 0x0261: /* LATIN SMALL LETTER SCRIPT G */ |
127 | case 0x027E: /* LATIN SMALL LETTER R WITH FISHHOOK */ |
128 | case 0x02D0: /* MODIFIER LETTER TRIANGULAR COLON */ |
129 | case 0x0335: /* COMBINING SHORT STROKE OVERLAY */ |
130 | case 0x0337: /* COMBINING SHORT SOLIDUS OVERLAY */ |
131 | case 0x0338: /* COMBINING LONG SOLIDUS OVERLAY */ |
132 | case 0x0589: /* ARMENIAN FULL STOP */ |
133 | case 0x05B4: /* HEBREW POINT HIRIQ */ |
134 | case 0x05BC: /* HEBREW POINT DAGESH OR MAPIQ */ |
135 | case 0x05C3: /* HEBREW PUNCTUATION SOF PASUQ */ |
136 | case 0x05F4: /* HEBREW PUNCTUATION GERSHAYIM */ |
137 | case 0x0609: /* ARABIC-INDIC PER MILLE SIGN */ |
138 | case 0x060A: /* ARABIC-INDIC PER TEN THOUSAND SIGN */ |
139 | case 0x0650: /* ARABIC KASRA */ |
140 | case 0x0660: /* ARABIC INDIC DIGIT ZERO */ |
141 | case 0x066A: /* ARABIC PERCENT SIGN */ |
142 | case 0x06D4: /* ARABIC FULL STOP */ |
143 | case 0x06F0: /* EXTENDED ARABIC INDIC DIGIT ZERO */ |
144 | case 0x0701: /* SYRIAC SUPRALINEAR FULL STOP */ |
145 | case 0x0702: /* SYRIAC SUBLINEAR FULL STOP */ |
146 | case 0x0703: /* SYRIAC SUPRALINEAR COLON */ |
147 | case 0x0704: /* SYRIAC SUBLINEAR COLON */ |
148 | case 0x1735: /* PHILIPPINE SINGLE PUNCTUATION */ |
149 | case 0x1D04: /* LATIN LETTER SMALL CAPITAL C */ |
150 | case 0x1D0F: /* LATIN LETTER SMALL CAPITAL O */ |
151 | case 0x1D1C: /* LATIN LETTER SMALL CAPITAL U */ |
152 | case 0x1D20: /* LATIN LETTER SMALL CAPITAL V */ |
153 | case 0x1D21: /* LATIN LETTER SMALL CAPITAL W */ |
154 | case 0x1D22: /* LATIN LETTER SMALL CAPITAL Z */ |
155 | case 0x1ECD: /* LATIN SMALL LETTER O WITH DOT BELOW */ |
156 | case 0x2010: /* HYPHEN */ |
157 | case 0x2011: /* NON-BREAKING HYPHEN */ |
158 | case 0x2024: /* ONE DOT LEADER */ |
159 | case 0x2027: /* HYPHENATION POINT */ |
160 | case 0x2039: /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */ |
161 | case 0x203A: /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */ |
162 | case 0x2041: /* CARET INSERTION POINT */ |
163 | case 0x2044: /* FRACTION SLASH */ |
164 | case 0x2052: /* COMMERCIAL MINUS SIGN */ |
165 | case 0x2153: /* VULGAR FRACTION ONE THIRD */ |
166 | case 0x2154: /* VULGAR FRACTION TWO THIRDS */ |
167 | case 0x2155: /* VULGAR FRACTION ONE FIFTH */ |
168 | case 0x2156: /* VULGAR FRACTION TWO FIFTHS */ |
169 | case 0x2157: /* VULGAR FRACTION THREE FIFTHS */ |
170 | case 0x2158: /* VULGAR FRACTION FOUR FIFTHS */ |
171 | case 0x2159: /* VULGAR FRACTION ONE SIXTH */ |
172 | case 0x215A: /* VULGAR FRACTION FIVE SIXTHS */ |
173 | case 0x215B: /* VULGAR FRACTION ONE EIGHT */ |
174 | case 0x215C: /* VULGAR FRACTION THREE EIGHTHS */ |
175 | case 0x215D: /* VULGAR FRACTION FIVE EIGHTHS */ |
176 | case 0x215E: /* VULGAR FRACTION SEVEN EIGHTHS */ |
177 | case 0x215F: /* FRACTION NUMERATOR ONE */ |
178 | case 0x2212: /* MINUS SIGN */ |
179 | case 0x2215: /* DIVISION SLASH */ |
180 | case 0x2216: /* SET MINUS */ |
181 | case 0x2236: /* RATIO */ |
182 | case 0x233F: /* APL FUNCTIONAL SYMBOL SLASH BAR */ |
183 | case 0x23AE: /* INTEGRAL EXTENSION */ |
184 | case 0x244A: /* OCR DOUBLE BACKSLASH */ |
185 | case 0x2571: /* BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT */ |
186 | case 0x2572: /* BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT */ |
187 | case 0x29F6: /* SOLIDUS WITH OVERBAR */ |
188 | case 0x29F8: /* BIG SOLIDUS */ |
189 | case 0x2AFB: /* TRIPLE SOLIDUS BINARY RELATION */ |
190 | case 0x2AFD: /* DOUBLE SOLIDUS OPERATOR */ |
191 | case 0x2FF0: /* IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT */ |
192 | case 0x2FF1: /* IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW */ |
193 | case 0x2FF2: /* IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT */ |
194 | case 0x2FF3: /* IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW */ |
195 | case 0x2FF4: /* IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND */ |
196 | case 0x2FF5: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE */ |
197 | case 0x2FF6: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM BELOW */ |
198 | case 0x2FF7: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LEFT */ |
199 | case 0x2FF8: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER LEFT */ |
200 | case 0x2FF9: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT */ |
201 | case 0x2FFA: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT */ |
202 | case 0x2FFB: /* IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID */ |
203 | case 0x3002: /* IDEOGRAPHIC FULL STOP */ |
204 | case 0x3008: /* LEFT ANGLE BRACKET */ |
205 | case 0x3014: /* LEFT TORTOISE SHELL BRACKET */ |
206 | case 0x3015: /* RIGHT TORTOISE SHELL BRACKET */ |
207 | case 0x3033: /* VERTICAL KANA REPEAT MARK UPPER HALF */ |
208 | case 0x3035: /* VERTICAL KANA REPEAT MARK LOWER HALF */ |
209 | case 0x321D: /* PARENTHESIZED KOREAN CHARACTER OJEON */ |
210 | case 0x321E: /* PARENTHESIZED KOREAN CHARACTER O HU */ |
211 | case 0x33AE: /* SQUARE RAD OVER S */ |
212 | case 0x33AF: /* SQUARE RAD OVER S SQUARED */ |
213 | case 0x33C6: /* SQUARE C OVER KG */ |
214 | case 0x33DF: /* SQUARE A OVER M */ |
215 | case 0x05B9: /* HEBREW POINT HOLAM */ |
216 | case 0x05BA: /* HEBREW POINT HOLAM HASER FOR VAV */ |
217 | case 0x05C1: /* HEBREW POINT SHIN DOT */ |
218 | case 0x05C2: /* HEBREW POINT SIN DOT */ |
219 | case 0x05C4: /* HEBREW MARK UPPER DOT */ |
220 | case 0xA731: /* LATIN LETTER SMALL CAPITAL S */ |
221 | case 0xA771: /* LATIN SMALL LETTER DUM */ |
222 | case 0xA789: /* MODIFIER LETTER COLON */ |
223 | case 0xFE14: /* PRESENTATION FORM FOR VERTICAL SEMICOLON */ |
224 | case 0xFE15: /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */ |
225 | case 0xFE3F: /* PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET */ |
226 | case 0xFE5D: /* SMALL LEFT TORTOISE SHELL BRACKET */ |
227 | case 0xFE5E: /* SMALL RIGHT TORTOISE SHELL BRACKET */ |
228 | case 0xFF0E: /* FULLWIDTH FULL STOP */ |
229 | case 0xFF0F: /* FULL WIDTH SOLIDUS */ |
230 | case 0xFF61: /* HALFWIDTH IDEOGRAPHIC FULL STOP */ |
231 | case 0xFFFC: /* OBJECT REPLACEMENT CHARACTER */ |
232 | case 0xFFFD: /* REPLACEMENT CHARACTER */ |
233 | case 0x1F50F: /* LOCK WITH INK PEN */ |
234 | case 0x1F510: /* CLOSED LOCK WITH KEY */ |
235 | case 0x1F511: /* KEY */ |
236 | case 0x1F512: /* LOCK */ |
237 | case 0x1F513: /* OPEN LOCK */ |
238 | return true; |
239 | case 0x0307: /* COMBINING DOT ABOVE */ |
240 | return previousCodePoint == 0x0237 /* LATIN SMALL LETTER DOTLESS J */ |
241 | || previousCodePoint == 0x0131 /* LATIN SMALL LETTER DOTLESS I */ |
242 | || previousCodePoint == 0x05D5; /* HEBREW LETTER VAV */ |
243 | case 0x0548: /* ARMENIAN CAPITAL LETTER VO */ |
244 | case 0x054D: /* ARMENIAN CAPITAL LETTER SEH */ |
245 | case 0x0578: /* ARMENIAN SMALL LETTER VO */ |
246 | case 0x057D: /* ARMENIAN SMALL LETTER SEH */ |
247 | return previousCodePoint |
248 | && !isASCIIDigitOrValidHostCharacter(previousCodePoint.value()) |
249 | && !isArmenianScriptCharacter(previousCodePoint.value()); |
250 | case '.': |
251 | return false; |
252 | default: |
253 | return previousCodePoint |
254 | && isArmenianLookalikeCharacter(previousCodePoint.value()) |
255 | && !(isArmenianScriptCharacter(charCode) || isASCIIDigitOrValidHostCharacter(charCode)); |
256 | } |
257 | } |
258 | |
259 | void whiteListIDNScript(const char* scriptName) |
260 | { |
261 | int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName); |
262 | if (script >= 0 && script < USCRIPT_CODE_LIMIT) { |
263 | size_t index = script / 32; |
264 | uint32_t mask = 1 << (script % 32); |
265 | IDNScriptWhiteList[index] |= mask; |
266 | } |
267 | } |
268 | |
269 | void initializeDefaultIDNScriptWhiteList() |
270 | { |
271 | const char* defaultIDNScriptWhiteList[20] = { |
272 | "Common" , |
273 | "Inherited" , |
274 | "Arabic" , |
275 | "Armenian" , |
276 | "Bopomofo" , |
277 | "Canadian_Aboriginal" , |
278 | "Devanagari" , |
279 | "Deseret" , |
280 | "Gujarati" , |
281 | "Gurmukhi" , |
282 | "Hangul" , |
283 | "Han" , |
284 | "Hebrew" , |
285 | "Hiragana" , |
286 | "Katakana_Or_Hiragana" , |
287 | "Katakana" , |
288 | "Latin" , |
289 | "Tamil" , |
290 | "Thai" , |
291 | "Yi" , |
292 | }; |
293 | for (const char* scriptName : defaultIDNScriptWhiteList) |
294 | whiteListIDNScript(scriptName); |
295 | } |
296 | |
297 | static bool allCharactersInIDNScriptWhiteList(const UChar* buffer, int32_t length) |
298 | { |
299 | loadIDNScriptWhiteList(); |
300 | int32_t i = 0; |
301 | Optional<UChar32> previousCodePoint; |
302 | while (i < length) { |
303 | UChar32 c; |
304 | U16_NEXT(buffer, i, length, c) |
305 | UErrorCode error = U_ZERO_ERROR; |
306 | UScriptCode script = uscript_getScript(c, &error); |
307 | if (error != U_ZERO_ERROR) { |
308 | LOG_ERROR("got ICU error while trying to look at scripts: %d" , error); |
309 | return false; |
310 | } |
311 | if (script < 0) { |
312 | LOG_ERROR("got negative number for script code from ICU: %d" , script); |
313 | return false; |
314 | } |
315 | if (script >= USCRIPT_CODE_LIMIT) |
316 | return false; |
317 | |
318 | size_t index = script / 32; |
319 | uint32_t mask = 1 << (script % 32); |
320 | if (!(IDNScriptWhiteList[index] & mask)) |
321 | return false; |
322 | |
323 | if (isLookalikeCharacter(previousCodePoint, c)) |
324 | return false; |
325 | previousCodePoint = c; |
326 | } |
327 | return true; |
328 | } |
329 | |
330 | template<typename Func> |
331 | static inline bool isSecondLevelDomainNameAllowedByTLDRules(const UChar* buffer, int32_t length, Func characterIsAllowed) |
332 | { |
333 | ASSERT(length > 0); |
334 | |
335 | for (int32_t i = length - 1; i >= 0; --i) { |
336 | UChar ch = buffer[i]; |
337 | |
338 | if (characterIsAllowed(ch)) |
339 | continue; |
340 | |
341 | // Only check the second level domain. Lower level registrars may have different rules. |
342 | if (ch == '.') |
343 | break; |
344 | |
345 | return false; |
346 | } |
347 | return true; |
348 | } |
349 | |
// Statement macro for functions that return bool and have `buffer` and `length` in
// scope. If the text ends with `suffix` (a UChar array) and is longer than it, the
// enclosing function returns the result of checking the part before the suffix with
// `function`; otherwise execution falls through to the next statement.
// Must be followed by a semicolon.
#define CHECK_RULES_IF_SUFFIX_MATCHES(suffix, function) \
    do { \
        static const int32_t suffixLength = sizeof(suffix) / sizeof(suffix[0]); \
        if (length > suffixLength && !memcmp(buffer + length - suffixLength, suffix, sizeof(suffix))) \
            return isSecondLevelDomainNameAllowedByTLDRules(buffer, length - suffixLength, function); \
    } while (false)
356 | |
357 | static bool isRussianDomainNameCharacter(UChar ch) |
358 | { |
359 | // Only modern Russian letters, digits and dashes are allowed. |
360 | return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || isASCIIDigit(ch) || ch == '-'; |
361 | } |
362 | |
363 | static bool allCharactersAllowedByTLDRules(const UChar* buffer, int32_t length) |
364 | { |
365 | // Skip trailing dot for root domain. |
366 | if (buffer[length - 1] == '.') |
367 | length--; |
368 | |
369 | // http://cctld.ru/files/pdf/docs/rules_ru-rf.pdf |
370 | static const UChar cyrillicRF[] = { |
371 | '.', |
372 | 0x0440, // CYRILLIC SMALL LETTER ER |
373 | 0x0444, // CYRILLIC SMALL LETTER EF |
374 | }; |
375 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRF, isRussianDomainNameCharacter); |
376 | |
377 | // http://rusnames.ru/rules.pl |
378 | static const UChar cyrillicRUS[] = { |
379 | '.', |
380 | 0x0440, // CYRILLIC SMALL LETTER ER |
381 | 0x0443, // CYRILLIC SMALL LETTER U |
382 | 0x0441, // CYRILLIC SMALL LETTER ES |
383 | }; |
384 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRUS, isRussianDomainNameCharacter); |
385 | |
386 | // http://ru.faitid.org/projects/moscow/documents/moskva/idn |
387 | static const UChar cyrillicMOSKVA[] = { |
388 | '.', |
389 | 0x043C, // CYRILLIC SMALL LETTER EM |
390 | 0x043E, // CYRILLIC SMALL LETTER O |
391 | 0x0441, // CYRILLIC SMALL LETTER ES |
392 | 0x043A, // CYRILLIC SMALL LETTER KA |
393 | 0x0432, // CYRILLIC SMALL LETTER VE |
394 | 0x0430, // CYRILLIC SMALL LETTER A |
395 | }; |
396 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMOSKVA, isRussianDomainNameCharacter); |
397 | |
398 | // http://www.dotdeti.ru/foruser/docs/regrules.php |
399 | static const UChar cyrillicDETI[] = { |
400 | '.', |
401 | 0x0434, // CYRILLIC SMALL LETTER DE |
402 | 0x0435, // CYRILLIC SMALL LETTER IE |
403 | 0x0442, // CYRILLIC SMALL LETTER TE |
404 | 0x0438, // CYRILLIC SMALL LETTER I |
405 | }; |
406 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicDETI, isRussianDomainNameCharacter); |
407 | |
408 | // http://corenic.org - rules not published. The word is Russian, so only allowing Russian at this time, |
409 | // although we may need to revise the checks if this ends up being used with other languages spoken in Russia. |
410 | static const UChar cyrillicONLAYN[] = { |
411 | '.', |
412 | 0x043E, // CYRILLIC SMALL LETTER O |
413 | 0x043D, // CYRILLIC SMALL LETTER EN |
414 | 0x043B, // CYRILLIC SMALL LETTER EL |
415 | 0x0430, // CYRILLIC SMALL LETTER A |
416 | 0x0439, // CYRILLIC SMALL LETTER SHORT I |
417 | 0x043D, // CYRILLIC SMALL LETTER EN |
418 | }; |
419 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicONLAYN, isRussianDomainNameCharacter); |
420 | |
421 | // http://corenic.org - same as above. |
422 | static const UChar cyrillicSAYT[] = { |
423 | '.', |
424 | 0x0441, // CYRILLIC SMALL LETTER ES |
425 | 0x0430, // CYRILLIC SMALL LETTER A |
426 | 0x0439, // CYRILLIC SMALL LETTER SHORT I |
427 | 0x0442, // CYRILLIC SMALL LETTER TE |
428 | }; |
429 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSAYT, isRussianDomainNameCharacter); |
430 | |
431 | // http://pir.org/products/opr-domain/ - rules not published. According to the registry site, |
432 | // the intended audience is "Russian and other Slavic-speaking markets". |
433 | // Chrome appears to only allow Russian, so sticking with that for now. |
434 | static const UChar cyrillicORG[] = { |
435 | '.', |
436 | 0x043E, // CYRILLIC SMALL LETTER O |
437 | 0x0440, // CYRILLIC SMALL LETTER ER |
438 | 0x0433, // CYRILLIC SMALL LETTER GHE |
439 | }; |
440 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicORG, isRussianDomainNameCharacter); |
441 | |
442 | // http://cctld.by/rules.html |
443 | static const UChar cyrillicBEL[] = { |
444 | '.', |
445 | 0x0431, // CYRILLIC SMALL LETTER BE |
446 | 0x0435, // CYRILLIC SMALL LETTER IE |
447 | 0x043B, // CYRILLIC SMALL LETTER EL |
448 | }; |
449 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBEL, [](UChar ch) { |
450 | // Russian and Byelorussian letters, digits and dashes are allowed. |
451 | return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x0456 || ch == 0x045E || ch == 0x2019 || isASCIIDigit(ch) || ch == '-'; |
452 | }); |
453 | |
454 | // http://www.nic.kz/docs/poryadok_vnedreniya_kaz_ru.pdf |
455 | static const UChar cyrillicKAZ[] = { |
456 | '.', |
457 | 0x049B, // CYRILLIC SMALL LETTER KA WITH DESCENDER |
458 | 0x0430, // CYRILLIC SMALL LETTER A |
459 | 0x0437, // CYRILLIC SMALL LETTER ZE |
460 | }; |
461 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicKAZ, [](UChar ch) { |
462 | // Kazakh letters, digits and dashes are allowed. |
463 | return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x04D9 || ch == 0x0493 || ch == 0x049B || ch == 0x04A3 || ch == 0x04E9 || ch == 0x04B1 || ch == 0x04AF || ch == 0x04BB || ch == 0x0456 || isASCIIDigit(ch) || ch == '-'; |
464 | }); |
465 | |
466 | // http://uanic.net/docs/documents-ukr/Rules%20of%20UKR_v4.0.pdf |
467 | static const UChar cyrillicUKR[] = { |
468 | '.', |
469 | 0x0443, // CYRILLIC SMALL LETTER U |
470 | 0x043A, // CYRILLIC SMALL LETTER KA |
471 | 0x0440, // CYRILLIC SMALL LETTER ER |
472 | }; |
473 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicUKR, [](UChar ch) { |
474 | // Russian and Ukrainian letters, digits and dashes are allowed. |
475 | return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x0491 || ch == 0x0404 || ch == 0x0456 || ch == 0x0457 || isASCIIDigit(ch) || ch == '-'; |
476 | }); |
477 | |
478 | // http://www.rnids.rs/data/DOKUMENTI/idn-srb-policy-termsofuse-v1.4-eng.pdf |
479 | static const UChar cyrillicSRB[] = { |
480 | '.', |
481 | 0x0441, // CYRILLIC SMALL LETTER ES |
482 | 0x0440, // CYRILLIC SMALL LETTER ER |
483 | 0x0431, // CYRILLIC SMALL LETTER BE |
484 | }; |
485 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSRB, [](UChar ch) { |
486 | // Serbian letters, digits and dashes are allowed. |
487 | return (ch >= 0x0430 && ch <= 0x0438) || (ch >= 0x043A && ch <= 0x0448) || ch == 0x0452 || ch == 0x0458 || ch == 0x0459 || ch == 0x045A || ch == 0x045B || ch == 0x045F || isASCIIDigit(ch) || ch == '-'; |
488 | }); |
489 | |
490 | // http://marnet.mk/doc/pravilnik-mk-mkd.pdf |
491 | static const UChar cyrillicMKD[] = { |
492 | '.', |
493 | 0x043C, // CYRILLIC SMALL LETTER EM |
494 | 0x043A, // CYRILLIC SMALL LETTER KA |
495 | 0x0434, // CYRILLIC SMALL LETTER DE |
496 | }; |
497 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMKD, [](UChar ch) { |
498 | // Macedonian letters, digits and dashes are allowed. |
499 | return (ch >= 0x0430 && ch <= 0x0438) || (ch >= 0x043A && ch <= 0x0448) || ch == 0x0453 || ch == 0x0455 || ch == 0x0458 || ch == 0x0459 || ch == 0x045A || ch == 0x045C || ch == 0x045F || isASCIIDigit(ch) || ch == '-'; |
500 | }); |
501 | |
502 | // https://www.mon.mn/cs/ |
503 | static const UChar cyrillicMON[] = { |
504 | '.', |
505 | 0x043C, // CYRILLIC SMALL LETTER EM |
506 | 0x043E, // CYRILLIC SMALL LETTER O |
507 | 0x043D, // CYRILLIC SMALL LETTER EN |
508 | }; |
509 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMON, [](UChar ch) { |
510 | // Mongolian letters, digits and dashes are allowed. |
511 | return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x04E9 || ch == 0x04AF || isASCIIDigit(ch) || ch == '-'; |
512 | }); |
513 | |
514 | // https://www.icann.org/sites/default/files/packages/lgr/lgr-second-level-bulgarian-30aug16-en.html |
515 | static const UChar cyrillicBG[] = { |
516 | '.', |
517 | 0x0431, // CYRILLIC SMALL LETTER BE |
518 | 0x0433 // CYRILLIC SMALL LETTER GHE |
519 | }; |
520 | CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBG, [](UChar ch) { |
521 | return (ch >= 0x0430 && ch <= 0x044A) || ch == 0x044C || (ch >= 0x044E && ch <= 0x0450) || ch == 0x045D || isASCIIDigit(ch) || ch == '-'; |
522 | }); |
523 | |
524 | // Not a known top level domain with special rules. |
525 | return false; |
526 | } |
527 | |
528 | // Return value of null means no mapping is necessary. |
529 | Optional<String> mapHostName(const String& hostName, const Optional<URLDecodeFunction>& decodeFunction) |
530 | { |
531 | if (hostName.length() > hostNameBufferLength) |
532 | return String(); |
533 | |
534 | if (!hostName.length()) |
535 | return String(); |
536 | |
537 | String string; |
538 | if (decodeFunction && string.contains('%')) |
539 | string = (*decodeFunction)(hostName); |
540 | else |
541 | string = hostName; |
542 | |
543 | unsigned length = string.length(); |
544 | |
545 | auto sourceBuffer = string.charactersWithNullTermination(); |
546 | |
547 | UChar destinationBuffer[hostNameBufferLength]; |
548 | UErrorCode uerror = U_ZERO_ERROR; |
549 | UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER; |
550 | int32_t numCharactersConverted = (decodeFunction ? uidna_nameToASCII : uidna_nameToUnicode)(&URLParser::internationalDomainNameTranscoder(), sourceBuffer.data(), length, destinationBuffer, hostNameBufferLength, &processingDetails, &uerror); |
551 | if (length && (U_FAILURE(uerror) || processingDetails.errors)) |
552 | return nullopt; |
553 | |
554 | if (numCharactersConverted == static_cast<int32_t>(length) && !memcmp(sourceBuffer.data(), destinationBuffer, length * sizeof(UChar))) |
555 | return String(); |
556 | |
557 | if (!decodeFunction && !allCharactersInIDNScriptWhiteList(destinationBuffer, numCharactersConverted) && !allCharactersAllowedByTLDRules(destinationBuffer, numCharactersConverted)) |
558 | return String(); |
559 | |
560 | return String(destinationBuffer, numCharactersConverted); |
561 | } |
562 | |
563 | using MappingRangesVector = Optional<Vector<std::tuple<unsigned, unsigned, String>>>; |
564 | |
565 | static void collectRangesThatNeedMapping(const String& string, unsigned location, unsigned length, MappingRangesVector& array, const Optional<URLDecodeFunction>& decodeFunction) |
566 | { |
567 | // Generally, we want to optimize for the case where there is one host name that does not need mapping. |
568 | // Therefore, we use null to indicate no mapping here and an empty array to indicate error. |
569 | |
570 | String substring = string.substringSharingImpl(location, length); |
571 | Optional<String> host = mapHostName(substring, decodeFunction); |
572 | |
573 | if (host && !*host) |
574 | return; |
575 | |
576 | if (!array) |
577 | array = Vector<std::tuple<unsigned, unsigned, String>>(); |
578 | |
579 | if (host) |
580 | array->constructAndAppend(location, length, *host); |
581 | } |
582 | |
583 | static void applyHostNameFunctionToMailToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array) |
584 | { |
585 | // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' character. |
586 | // Skip quoted strings so that characters in them don't confuse us. |
587 | // When we find a '?' character, we are past the part of the URL that contains host names. |
588 | |
589 | unsigned stringLength = string.length(); |
590 | unsigned current = 0; |
591 | |
592 | while (1) { |
593 | // Find start of host name or of quoted string. |
594 | auto hostNameOrStringStart = string.find([](UChar ch) { |
595 | return ch == '"' || ch == '@' || ch == '?'; |
596 | }, current); |
597 | if (hostNameOrStringStart == notFound) |
598 | return; |
599 | |
600 | UChar c = string[hostNameOrStringStart]; |
601 | current = hostNameOrStringStart + 1; |
602 | |
603 | if (c == '?') |
604 | return; |
605 | |
606 | if (c == '@') { |
607 | // Find end of host name. |
608 | unsigned hostNameStart = current; |
609 | auto hostNameEnd = string.find([](UChar ch) { |
610 | return ch == '>' || ch == ',' || ch == '?'; |
611 | }, current); |
612 | |
613 | bool done; |
614 | if (hostNameEnd == notFound) { |
615 | hostNameEnd = stringLength; |
616 | done = true; |
617 | } else { |
618 | current = hostNameEnd; |
619 | done = false; |
620 | } |
621 | |
622 | // Process host name range. |
623 | collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction); |
624 | |
625 | if (done) |
626 | return; |
627 | } else { |
628 | // Skip quoted string. |
629 | ASSERT(c == '"'); |
630 | while (1) { |
631 | auto escapedCharacterOrStringEnd = string.find([](UChar ch) { |
632 | return ch == '"' || ch == '\\'; |
633 | }, current); |
634 | if (escapedCharacterOrStringEnd == notFound) |
635 | return; |
636 | |
637 | c = string[escapedCharacterOrStringEnd]; |
638 | current = escapedCharacterOrStringEnd + 1; |
639 | |
640 | // If we are the end of the string, then break from the string loop back to the host name loop. |
641 | if (c == '"') |
642 | break; |
643 | |
644 | // Skip escaped character. |
645 | ASSERT(c == '\\'); |
646 | if (current == stringLength) |
647 | return; |
648 | |
649 | ++current; |
650 | } |
651 | } |
652 | } |
653 | } |
654 | |
655 | static void applyHostNameFunctionToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array) |
656 | { |
657 | // Find hostnames. Too bad we can't use any real URL-parsing code to do this, |
658 | // but we have to do it before doing all the %-escaping, and this is the only |
659 | // code we have that parses mailto URLs anyway. |
660 | |
661 | // Maybe we should implement this using a character buffer instead? |
662 | |
663 | if (protocolIs(string, "mailto" )) { |
664 | applyHostNameFunctionToMailToURLString(string, decodeFunction, array); |
665 | return; |
666 | } |
667 | |
668 | // Find the host name in a hierarchical URL. |
669 | // It comes after a "://" sequence, with scheme characters preceding. |
670 | // If ends with the end of the string or a ":", "/", or a "?". |
671 | // If there is a "@" character, the host part is just the part after the "@". |
672 | static const char* separator = "://" ; |
673 | auto separatorIndex = string.find(separator); |
674 | if (separatorIndex == notFound) |
675 | return; |
676 | |
677 | unsigned authorityStart = separatorIndex + strlen(separator); |
678 | |
679 | // Check that all characters before the :// are valid scheme characters. |
680 | if (StringView { string }.left(separatorIndex).contains([](UChar character) { |
681 | return !(isASCIIAlphanumeric(character) || character == '+' || character == '-' || character == '.'); |
682 | })) |
683 | return; |
684 | |
685 | // Find terminating character. |
686 | auto hostNameTerminator = string.find([](UChar character) { |
687 | return character == ':' || character == '/' || character == '?' || character == '#'; |
688 | }, authorityStart); |
689 | unsigned hostNameEnd = hostNameTerminator == notFound ? string.length() : hostNameTerminator; |
690 | |
691 | // Find "@" for the start of the host name. |
692 | auto userInfoTerminator = StringView { string }.left(hostNameEnd).find('@', authorityStart); |
693 | unsigned hostNameStart = userInfoTerminator == notFound ? authorityStart : userInfoTerminator + 1; |
694 | |
695 | collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction); |
696 | } |
697 | |
698 | String mapHostNames(const String& string, const Optional<URLDecodeFunction>& decodeFunction) |
699 | { |
700 | // Generally, we want to optimize for the case where there is one host name that does not need mapping. |
701 | |
702 | if (decodeFunction && string.isAllASCII()) |
703 | return string; |
704 | |
705 | // Make a list of ranges that actually need mapping. |
706 | MappingRangesVector hostNameRanges; |
707 | applyHostNameFunctionToURLString(string, decodeFunction, hostNameRanges); |
708 | if (!hostNameRanges) |
709 | return string; |
710 | |
711 | if (hostNameRanges->isEmpty()) |
712 | return { }; |
713 | |
714 | // Do the mapping. |
715 | String result = string; |
716 | while (!hostNameRanges->isEmpty()) { |
717 | auto [location, length, mappedHostName] = hostNameRanges->takeLast(); |
718 | result = result.replace(location, length, mappedHostName); |
719 | } |
720 | return result; |
721 | } |
722 | |
723 | static String escapeUnsafeCharacters(const String& sourceBuffer) |
724 | { |
725 | unsigned length = sourceBuffer.length(); |
726 | |
727 | Optional<UChar32> previousCodePoint; |
728 | |
729 | unsigned i; |
730 | for (i = 0; i < length; ) { |
731 | UChar32 c = sourceBuffer.characterStartingAt(i); |
732 | if (isLookalikeCharacter(previousCodePoint, sourceBuffer.characterStartingAt(i))) |
733 | break; |
734 | previousCodePoint = c; |
735 | i += U16_LENGTH(c); |
736 | } |
737 | |
738 | if (i == length) |
739 | return sourceBuffer; |
740 | |
741 | Vector<UChar, urlBytesBufferLength> outBuffer; |
742 | |
743 | outBuffer.grow(i); |
744 | if (sourceBuffer.is8Bit()) |
745 | StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters8(), i); |
746 | else |
747 | StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters16(), i); |
748 | |
749 | for (; i < length; ) { |
750 | UChar32 c = sourceBuffer.characterStartingAt(i); |
751 | unsigned characterLength = U16_LENGTH(c); |
752 | if (isLookalikeCharacter(previousCodePoint, c)) { |
753 | uint8_t utf8Buffer[4]; |
754 | size_t offset = 0; |
755 | UBool failure = false; |
756 | U8_APPEND(utf8Buffer, offset, 4, c, failure); |
757 | ASSERT_UNUSED(failure, !failure); |
758 | |
759 | for (size_t j = 0; j < offset; ++j) { |
760 | outBuffer.append('%'); |
761 | outBuffer.append(upperNibbleToASCIIHexDigit(utf8Buffer[j])); |
762 | outBuffer.append(lowerNibbleToASCIIHexDigit(utf8Buffer[j])); |
763 | } |
764 | } else { |
765 | for (unsigned j = 0; j < characterLength; ++j) |
766 | outBuffer.append(sourceBuffer[i + j]); |
767 | } |
768 | previousCodePoint = c; |
769 | i += characterLength; |
770 | } |
771 | |
772 | return String::adopt(WTFMove(outBuffer)); |
773 | } |
774 | |
775 | String userVisibleURL(const CString& url) |
776 | { |
777 | auto* before = reinterpret_cast<const unsigned char*>(url.data()); |
778 | int length = url.length(); |
779 | |
780 | if (!length) |
781 | return { }; |
782 | |
783 | bool mayNeedHostNameDecoding = false; |
784 | |
785 | Checked<int, RecordOverflow> bufferLength = length; |
786 | bufferLength = bufferLength * 3 + 1; // The buffer should be large enough to %-escape every character. |
787 | if (bufferLength.hasOverflowed()) |
788 | return { }; |
789 | Vector<char, urlBytesBufferLength> after(bufferLength.unsafeGet()); |
790 | |
791 | char* q = after.data(); |
792 | { |
793 | const unsigned char* p = before; |
794 | for (int i = 0; i < length; i++) { |
795 | unsigned char c = p[i]; |
796 | // unescape escape sequences that indicate bytes greater than 0x7f |
797 | if (c == '%' && i + 2 < length && isASCIIHexDigit(p[i + 1]) && isASCIIHexDigit(p[i + 2])) { |
798 | auto u = toASCIIHexValue(p[i + 1], p[i + 2]); |
799 | if (u > 0x7f) { |
800 | // unescape |
801 | *q++ = u; |
802 | } else { |
803 | // do not unescape |
804 | *q++ = p[i]; |
805 | *q++ = p[i + 1]; |
806 | *q++ = p[i + 2]; |
807 | } |
808 | i += 2; |
809 | } else { |
810 | *q++ = c; |
811 | |
812 | // Check for "xn--" in an efficient, non-case-sensitive, way. |
813 | if (c == '-' && i >= 3 && !mayNeedHostNameDecoding && (q[-4] | 0x20) == 'x' && (q[-3] | 0x20) == 'n' && q[-2] == '-') |
814 | mayNeedHostNameDecoding = true; |
815 | } |
816 | } |
817 | *q = '\0'; |
818 | } |
819 | |
820 | // Check string to see if it can be converted to display using UTF-8 |
821 | String result = String::fromUTF8(after.data()); |
822 | if (!result) { |
823 | // Could not convert to UTF-8. |
824 | // Convert characters greater than 0x7f to escape sequences. |
825 | // Shift current string to the end of the buffer |
826 | // then we will copy back bytes to the start of the buffer |
827 | // as we convert. |
828 | int afterlength = q - after.data(); |
829 | char* p = after.data() + bufferLength.unsafeGet() - afterlength - 1; |
830 | memmove(p, after.data(), afterlength + 1); // copies trailing '\0' |
831 | char* q = after.data(); |
832 | while (*p) { |
833 | unsigned char c = *p; |
834 | if (c > 0x7f) { |
835 | *q++ = '%'; |
836 | *q++ = upperNibbleToASCIIHexDigit(c); |
837 | *q++ = lowerNibbleToASCIIHexDigit(c); |
838 | } else |
839 | *q++ = *p; |
840 | p++; |
841 | } |
842 | *q = '\0'; |
843 | // Note: after.data() points to a null-terminated, pure ASCII string. |
844 | result = String::fromUTF8(after.data()); |
845 | ASSERT(!!result); |
846 | } |
847 | |
848 | // Note: result is UTF–16 string, created from either a valid UTF-8 string, |
849 | // or a pure ASCII string (where all bytes with the high bit set are |
850 | // percent-encoded). |
851 | |
852 | if (mayNeedHostNameDecoding) { |
853 | // FIXME: Is it good to ignore the failure of mapHostNames and keep result intact? |
854 | auto mappedResult = mapHostNames(result, nullopt); |
855 | if (!!mappedResult) |
856 | result = mappedResult; |
857 | } |
858 | |
859 | return escapeUnsafeCharacters(normalizedNFC(result)); |
860 | } |
861 | |
862 | } // namespace URLHelpers |
863 | } // namespace WTF |
864 | |