1/*
2 * Copyright (C) 2016-2018 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include <wtf/URLParser.h>
28
29#include <array>
30#include <mutex>
31#include <unicode/uidna.h>
32#include <unicode/utf8.h>
33#include <unicode/utypes.h>
34
35namespace WTF {
36
// Compile-time switch for parser tracing. When it is 0, URL_PARSER_LOG
// expands to nothing, so its arguments are never evaluated; when it is
// non-zero, the arguments are forwarded to WTFLogAlways.
#define URL_PARSER_DEBUGGING 0

#if URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...) WTFLogAlways(__VA_ARGS__)
#else
#define URL_PARSER_LOG(...)
#endif
44
45template<typename CharacterType>
46class CodePointIterator {
47public:
48 ALWAYS_INLINE CodePointIterator() { }
49 ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
50 : m_begin(begin)
51 , m_end(end)
52 {
53 }
54
55 ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
56 : CodePointIterator(begin.m_begin, end.m_begin)
57 {
58 ASSERT(end.m_begin >= begin.m_begin);
59 }
60
61 ALWAYS_INLINE UChar32 operator*() const;
62 ALWAYS_INLINE CodePointIterator& operator++();
63
64 ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
65 {
66 return m_begin == other.m_begin
67 && m_end == other.m_end;
68 }
69 ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
70
71 ALWAYS_INLINE bool atEnd() const
72 {
73 ASSERT(m_begin <= m_end);
74 return m_begin >= m_end;
75 }
76
77 ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
78 {
79 ASSERT(m_begin >= reference);
80 return m_begin - reference;
81 }
82
83 ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
84 {
85 return codeUnitsSince(other.m_begin);
86 }
87
88private:
89 const CharacterType* m_begin { nullptr };
90 const CharacterType* m_end { nullptr };
91};
92
93template<>
94ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const
95{
96 ASSERT(!atEnd());
97 return *m_begin;
98}
99
100template<>
101ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
102{
103 m_begin++;
104 return *this;
105}
106
107template<>
108ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const
109{
110 ASSERT(!atEnd());
111 UChar32 c;
112 U16_GET(m_begin, 0, 0, m_end - m_begin, c);
113 return c;
114}
115
116template<>
117ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
118{
119 unsigned i = 0;
120 size_t length = m_end - m_begin;
121 U16_FWD_1(m_begin, i, length);
122 m_begin += i;
123 return *this;
124}
125
126ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
127{
128 if (U_IS_BMP(codePoint)) {
129 destination.append(static_cast<UChar>(codePoint));
130 return;
131 }
132 destination.reserveCapacity(destination.size() + 2);
133 destination.uncheckedAppend(U16_LEAD(codePoint));
134 destination.uncheckedAppend(U16_TRAIL(codePoint));
135}
136
// Bit flags stored per byte value in characterClassTable. Each flag is tested
// by the predicate of the matching name (isInUserInfoEncodeSet,
// isInDefaultEncodeSet, isForbiddenHostCodePoint, shouldPercentEncodeQueryByte,
// isSlashQuestionOrHash, isValidSchemeCharacter).
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    ForbiddenHost = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};
145
// Lookup table indexed by byte value (0x00-0xFF). Each entry is the bitwise OR
// of the URLCharacterClass flags that apply to that byte; 0 means the byte is
// in none of the classes. The trailing comment on each row names the byte the
// row describes, so rows must stay in ascending order.
static const uint8_t characterClassTable[256] = {
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0
    UserInfo | Default | QueryPercent, // 0x1
    UserInfo | Default | QueryPercent, // 0x2
    UserInfo | Default | QueryPercent, // 0x3
    UserInfo | Default | QueryPercent, // 0x4
    UserInfo | Default | QueryPercent, // 0x5
    UserInfo | Default | QueryPercent, // 0x6
    UserInfo | Default | QueryPercent, // 0x7
    UserInfo | Default | QueryPercent, // 0x8
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x9
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xA
    UserInfo | Default | QueryPercent, // 0xB
    UserInfo | Default | QueryPercent, // 0xC
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xD
    UserInfo | Default | QueryPercent, // 0xE
    UserInfo | Default | QueryPercent, // 0xF
    UserInfo | Default | QueryPercent, // 0x10
    UserInfo | Default | QueryPercent, // 0x11
    UserInfo | Default | QueryPercent, // 0x12
    UserInfo | Default | QueryPercent, // 0x13
    UserInfo | Default | QueryPercent, // 0x14
    UserInfo | Default | QueryPercent, // 0x15
    UserInfo | Default | QueryPercent, // 0x16
    UserInfo | Default | QueryPercent, // 0x17
    UserInfo | Default | QueryPercent, // 0x18
    UserInfo | Default | QueryPercent, // 0x19
    UserInfo | Default | QueryPercent, // 0x1A
    UserInfo | Default | QueryPercent, // 0x1B
    UserInfo | Default | QueryPercent, // 0x1C
    UserInfo | Default | QueryPercent, // 0x1D
    UserInfo | Default | QueryPercent, // 0x1E
    UserInfo | Default | QueryPercent, // 0x1F
    UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
    0, // '$'
    ForbiddenHost, // '%'
    0, // '&'
    0, // '\''
    0, // '('
    0, // ')'
    0, // '*'
    ValidScheme, // '+'
    0, // ','
    ValidScheme, // '-'
    ValidScheme, // '.'
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'
    ValidScheme, // '0'
    ValidScheme, // '1'
    ValidScheme, // '2'
    ValidScheme, // '3'
    ValidScheme, // '4'
    ValidScheme, // '5'
    ValidScheme, // '6'
    ValidScheme, // '7'
    ValidScheme, // '8'
    ValidScheme, // '9'
    UserInfo | ForbiddenHost, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent | ForbiddenHost, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent | ForbiddenHost, // '>'
    UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
    UserInfo | ForbiddenHost, // '@'
    ValidScheme, // 'A'
    ValidScheme, // 'B'
    ValidScheme, // 'C'
    ValidScheme, // 'D'
    ValidScheme, // 'E'
    ValidScheme, // 'F'
    ValidScheme, // 'G'
    ValidScheme, // 'H'
    ValidScheme, // 'I'
    ValidScheme, // 'J'
    ValidScheme, // 'K'
    ValidScheme, // 'L'
    ValidScheme, // 'M'
    ValidScheme, // 'N'
    ValidScheme, // 'O'
    ValidScheme, // 'P'
    ValidScheme, // 'Q'
    ValidScheme, // 'R'
    ValidScheme, // 'S'
    ValidScheme, // 'T'
    ValidScheme, // 'U'
    ValidScheme, // 'V'
    ValidScheme, // 'W'
    ValidScheme, // 'X'
    ValidScheme, // 'Y'
    ValidScheme, // 'Z'
    UserInfo | ForbiddenHost, // '['
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
    UserInfo | ForbiddenHost, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    ValidScheme, // 'a'
    ValidScheme, // 'b'
    ValidScheme, // 'c'
    ValidScheme, // 'd'
    ValidScheme, // 'e'
    ValidScheme, // 'f'
    ValidScheme, // 'g'
    ValidScheme, // 'h'
    ValidScheme, // 'i'
    ValidScheme, // 'j'
    ValidScheme, // 'k'
    ValidScheme, // 'l'
    ValidScheme, // 'm'
    ValidScheme, // 'n'
    ValidScheme, // 'o'
    ValidScheme, // 'p'
    ValidScheme, // 'q'
    ValidScheme, // 'r'
    ValidScheme, // 's'
    ValidScheme, // 't'
    ValidScheme, // 'u'
    ValidScheme, // 'v'
    ValidScheme, // 'w'
    ValidScheme, // 'x'
    ValidScheme, // 'y'
    ValidScheme, // 'z'
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    // From 0x7F upward only QueryPercent is recorded. The UserInfo and Default
    // predicates never read these rows because they return true for anything
    // above 0x7E before consulting the table.
    QueryPercent, // 0x7F
    QueryPercent, // 0x80
    QueryPercent, // 0x81
    QueryPercent, // 0x82
    QueryPercent, // 0x83
    QueryPercent, // 0x84
    QueryPercent, // 0x85
    QueryPercent, // 0x86
    QueryPercent, // 0x87
    QueryPercent, // 0x88
    QueryPercent, // 0x89
    QueryPercent, // 0x8A
    QueryPercent, // 0x8B
    QueryPercent, // 0x8C
    QueryPercent, // 0x8D
    QueryPercent, // 0x8E
    QueryPercent, // 0x8F
    QueryPercent, // 0x90
    QueryPercent, // 0x91
    QueryPercent, // 0x92
    QueryPercent, // 0x93
    QueryPercent, // 0x94
    QueryPercent, // 0x95
    QueryPercent, // 0x96
    QueryPercent, // 0x97
    QueryPercent, // 0x98
    QueryPercent, // 0x99
    QueryPercent, // 0x9A
    QueryPercent, // 0x9B
    QueryPercent, // 0x9C
    QueryPercent, // 0x9D
    QueryPercent, // 0x9E
    QueryPercent, // 0x9F
    QueryPercent, // 0xA0
    QueryPercent, // 0xA1
    QueryPercent, // 0xA2
    QueryPercent, // 0xA3
    QueryPercent, // 0xA4
    QueryPercent, // 0xA5
    QueryPercent, // 0xA6
    QueryPercent, // 0xA7
    QueryPercent, // 0xA8
    QueryPercent, // 0xA9
    QueryPercent, // 0xAA
    QueryPercent, // 0xAB
    QueryPercent, // 0xAC
    QueryPercent, // 0xAD
    QueryPercent, // 0xAE
    QueryPercent, // 0xAF
    QueryPercent, // 0xB0
    QueryPercent, // 0xB1
    QueryPercent, // 0xB2
    QueryPercent, // 0xB3
    QueryPercent, // 0xB4
    QueryPercent, // 0xB5
    QueryPercent, // 0xB6
    QueryPercent, // 0xB7
    QueryPercent, // 0xB8
    QueryPercent, // 0xB9
    QueryPercent, // 0xBA
    QueryPercent, // 0xBB
    QueryPercent, // 0xBC
    QueryPercent, // 0xBD
    QueryPercent, // 0xBE
    QueryPercent, // 0xBF
    QueryPercent, // 0xC0
    QueryPercent, // 0xC1
    QueryPercent, // 0xC2
    QueryPercent, // 0xC3
    QueryPercent, // 0xC4
    QueryPercent, // 0xC5
    QueryPercent, // 0xC6
    QueryPercent, // 0xC7
    QueryPercent, // 0xC8
    QueryPercent, // 0xC9
    QueryPercent, // 0xCA
    QueryPercent, // 0xCB
    QueryPercent, // 0xCC
    QueryPercent, // 0xCD
    QueryPercent, // 0xCE
    QueryPercent, // 0xCF
    QueryPercent, // 0xD0
    QueryPercent, // 0xD1
    QueryPercent, // 0xD2
    QueryPercent, // 0xD3
    QueryPercent, // 0xD4
    QueryPercent, // 0xD5
    QueryPercent, // 0xD6
    QueryPercent, // 0xD7
    QueryPercent, // 0xD8
    QueryPercent, // 0xD9
    QueryPercent, // 0xDA
    QueryPercent, // 0xDB
    QueryPercent, // 0xDC
    QueryPercent, // 0xDD
    QueryPercent, // 0xDE
    QueryPercent, // 0xDF
    QueryPercent, // 0xE0
    QueryPercent, // 0xE1
    QueryPercent, // 0xE2
    QueryPercent, // 0xE3
    QueryPercent, // 0xE4
    QueryPercent, // 0xE5
    QueryPercent, // 0xE6
    QueryPercent, // 0xE7
    QueryPercent, // 0xE8
    QueryPercent, // 0xE9
    QueryPercent, // 0xEA
    QueryPercent, // 0xEB
    QueryPercent, // 0xEC
    QueryPercent, // 0xED
    QueryPercent, // 0xEE
    QueryPercent, // 0xEF
    QueryPercent, // 0xF0
    QueryPercent, // 0xF1
    QueryPercent, // 0xF2
    QueryPercent, // 0xF3
    QueryPercent, // 0xF4
    QueryPercent, // 0xF5
    QueryPercent, // 0xF6
    QueryPercent, // 0xF7
    QueryPercent, // 0xF8
    QueryPercent, // 0xF9
    QueryPercent, // 0xFA
    QueryPercent, // 0xFB
    QueryPercent, // 0xFC
    QueryPercent, // 0xFD
    QueryPercent, // 0xFE
    QueryPercent, // 0xFF
};
404
405template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; }
406template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; }
407template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; }
408template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); }
409template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; }
410template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; }
411template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; }
412template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; }
413template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; }
414template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= ']' && characterClassTable[character] & ForbiddenHost; }
415ALWAYS_INLINE static bool shouldPercentEncodeQueryByte(uint8_t byte, const bool& urlIsSpecial)
416{
417 if (characterClassTable[byte] & QueryPercent)
418 return true;
419 if (byte == '\'' && urlIsSpecial)
420 return true;
421 return false;
422}
423
424bool URLParser::isInUserInfoEncodeSet(UChar c)
425{
426 return WTF::isInUserInfoEncodeSet(c);
427}
428
429template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
430ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
431{
432 ++iterator;
433 while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
434 if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
435 syntaxViolation(iteratorForSyntaxViolationPosition);
436 ++iterator;
437 }
438}
439
440template<typename CharacterType>
441bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
442{
443 if (iterator.atEnd())
444 return false;
445 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
446 if (iterator.atEnd())
447 return false;
448 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
449 return iterator.atEnd();
450}
451
452template<typename CharacterType>
453ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
454{
455 if (iterator.atEnd() || !isASCIIAlpha(*iterator))
456 return false;
457 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
458 if (iterator.atEnd())
459 return false;
460 if (*iterator == ':')
461 return true;
462 if (UNLIKELY(*iterator == '|'))
463 return true;
464 return false;
465}
466
467ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
468{
469 ASSERT(isASCII(codePoint));
470 if (UNLIKELY(m_didSeeSyntaxViolation))
471 m_asciiBuffer.append(codePoint);
472}
473
474ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
475{
476 if (UNLIKELY(m_didSeeSyntaxViolation))
477 m_asciiBuffer.append(characters, length);
478}
479
480template<typename CharacterType>
481void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
482{
483 ASSERT(isWindowsDriveLetter(iterator));
484 appendToASCIIBuffer(*iterator);
485 advance(iterator);
486 ASSERT(!iterator.atEnd());
487 ASSERT(*iterator == ':' || *iterator == '|');
488 if (*iterator == '|')
489 syntaxViolation(iterator);
490 appendToASCIIBuffer(':');
491 advance(iterator);
492}
493
494bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
495{
496 if (base.protocolIs("file")) {
497 RELEASE_ASSERT(base.m_hostEnd + base.m_portLength < base.m_string.length());
498 if (base.m_string.is8Bit()) {
499 const LChar* begin = base.m_string.characters8();
500 CodePointIterator<LChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length());
501 if (isWindowsDriveLetter(c)) {
502 appendWindowsDriveLetter(c);
503 return true;
504 }
505 } else {
506 const UChar* begin = base.m_string.characters16();
507 CodePointIterator<UChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length());
508 if (isWindowsDriveLetter(c)) {
509 appendWindowsDriveLetter(c);
510 return true;
511 }
512 }
513 }
514 return false;
515}
516
517template<typename CharacterType>
518bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
519{
520 if (!isWindowsDriveLetter(iterator))
521 return true;
522 if (iterator.atEnd())
523 return false;
524 advance(iterator);
525 if (iterator.atEnd())
526 return true;
527 advance(iterator);
528 if (iterator.atEnd())
529 return true;
530 return !isSlashQuestionOrHash(*iterator);
531}
532
533static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
534{
535 buffer.append('%');
536 buffer.append(upperNibbleToASCIIHexDigit(byte));
537 buffer.append(lowerNibbleToASCIIHexDigit(byte));
538}
539
540void URLParser::percentEncodeByte(uint8_t byte)
541{
542 ASSERT(m_didSeeSyntaxViolation);
543 appendToASCIIBuffer('%');
544 appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
545 appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
546}
547
// Percent-encoded form of the bytes EF BF BD (the UTF-8 encoding of U+FFFD),
// emitted in place of code points that are not Unicode characters.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
// Character count of the string above, excluding the terminating NUL.
const size_t replacementCharacterUTF8PercentEncodedLength = (sizeof replacementCharacterUTF8PercentEncoded) - 1;
550
551template<bool(*isInCodeSet)(UChar32), typename CharacterType>
552ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
553{
554 ASSERT(!iterator.atEnd());
555 UChar32 codePoint = *iterator;
556 if (LIKELY(isASCII(codePoint))) {
557 if (UNLIKELY(isInCodeSet(codePoint))) {
558 syntaxViolation(iterator);
559 percentEncodeByte(codePoint);
560 } else
561 appendToASCIIBuffer(codePoint);
562 return;
563 }
564 ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
565 syntaxViolation(iterator);
566
567 if (!U_IS_UNICODE_CHAR(codePoint)) {
568 appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
569 return;
570 }
571
572 uint8_t buffer[U8_MAX_LENGTH];
573 int32_t offset = 0;
574 U8_APPEND_UNSAFE(buffer, offset, codePoint);
575 for (int32_t i = 0; i < offset; ++i)
576 percentEncodeByte(buffer[i]);
577}
578
579template<typename CharacterType>
580ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
581{
582 ASSERT(!iterator.atEnd());
583 UChar32 codePoint = *iterator;
584 if (LIKELY(isASCII(codePoint))) {
585 if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint, m_urlIsSpecial))) {
586 syntaxViolation(iterator);
587 percentEncodeByte(codePoint);
588 } else
589 appendToASCIIBuffer(codePoint);
590 return;
591 }
592
593 syntaxViolation(iterator);
594
595 if (!U_IS_UNICODE_CHAR(codePoint)) {
596 appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
597 return;
598 }
599
600 uint8_t buffer[U8_MAX_LENGTH];
601 int32_t offset = 0;
602 U8_APPEND_UNSAFE(buffer, offset, codePoint);
603 for (int32_t i = 0; i < offset; ++i) {
604 auto byte = buffer[i];
605 if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
606 percentEncodeByte(byte);
607 else
608 appendToASCIIBuffer(byte);
609 }
610}
611
// Writes the query for a URL whose query uses a non-UTF-8 encoding.
// |source| holds the query's code units, |encoding| converts them to bytes,
// and |iterator| is a copy of the input position where the query starts.
// The encoded bytes are compared with the input for as long as they match and
// need no escaping; from the first difference on, the remaining encoded bytes
// are written out explicitly.
template<typename CharacterType>
void URLParser::encodeNonUTF8Query(const Vector<UChar>& source, const URLTextEncoding& encoding, CodePointIterator<CharacterType> iterator)
{
    auto encoded = encoding.encodeForURLParsing(StringView(source.data(), source.size()));
    auto* data = encoded.data();
    size_t length = encoded.size();

    // Exactly one of "encoded output" and "remaining input" is empty: report
    // a syntax violation and emit nothing.
    if (!length == !iterator.atEnd()) {
        syntaxViolation(iterator);
        return;
    }

    // Phase 1: walk the encoded bytes in step with the input. Stop at the
    // first byte that differs from the input or that must be percent-encoded,
    // reporting a syntax violation at that position.
    size_t i = 0;
    for (; i < length; ++i) {
        ASSERT(!iterator.atEnd());
        uint8_t byte = data[i];
        if (UNLIKELY(byte != *iterator)) {
            syntaxViolation(iterator);
            break;
        }
        if (UNLIKELY(shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))) {
            syntaxViolation(iterator);
            break;
        }
        appendToASCIIBuffer(byte);
        ++iterator;
    }
    // Step over any tabs/newlines at the current input position.
    while (!iterator.atEnd() && isTabOrNewline(*iterator))
        ++iterator;
    ASSERT((i == length) == iterator.atEnd());
    // Phase 2: only runs if phase 1 stopped early. Write the rest of the
    // encoded bytes, percent-encoding those the query rule requires.
    for (; i < length; ++i) {
        ASSERT(m_didSeeSyntaxViolation);
        uint8_t byte = data[i];
        if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
            percentEncodeByte(byte);
        else
            appendToASCIIBuffer(byte);
    }
}
651
652Optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
653{
654 static const uint16_t ftpPort = 21;
655 static const uint16_t gopherPort = 70;
656 static const uint16_t httpPort = 80;
657 static const uint16_t httpsPort = 443;
658 static const uint16_t wsPort = 80;
659 static const uint16_t wssPort = 443;
660
661 auto length = scheme.length();
662 if (!length)
663 return WTF::nullopt;
664 switch (scheme[0]) {
665 case 'w':
666 switch (length) {
667 case 2:
668 if (scheme[1] == 's')
669 return wsPort;
670 return WTF::nullopt;
671 case 3:
672 if (scheme[1] == 's'
673 && scheme[2] == 's')
674 return wssPort;
675 return WTF::nullopt;
676 default:
677 return false;
678 }
679 case 'h':
680 switch (length) {
681 case 4:
682 if (scheme[1] == 't'
683 && scheme[2] == 't'
684 && scheme[3] == 'p')
685 return httpPort;
686 return WTF::nullopt;
687 case 5:
688 if (scheme[1] == 't'
689 && scheme[2] == 't'
690 && scheme[3] == 'p'
691 && scheme[4] == 's')
692 return httpsPort;
693 return WTF::nullopt;
694 default:
695 return WTF::nullopt;
696 }
697 case 'g':
698 if (length == 6
699 && scheme[1] == 'o'
700 && scheme[2] == 'p'
701 && scheme[3] == 'h'
702 && scheme[4] == 'e'
703 && scheme[5] == 'r')
704 return gopherPort;
705 return WTF::nullopt;
706 case 'f':
707 if (length == 3
708 && scheme[1] == 't'
709 && scheme[2] == 'p')
710 return ftpPort;
711 return WTF::nullopt;
712 default:
713 return WTF::nullopt;
714 }
715}
716
// Result of classifying a scheme string with scheme(). Every value except
// NonSpecial names one scheme the parser treats as special.
enum class Scheme {
    WS, // "ws"
    WSS, // "wss"
    File, // "file"
    FTP, // "ftp"
    Gopher, // "gopher"
    HTTP, // "http"
    HTTPS, // "https"
    NonSpecial, // anything else, including the empty string
};
727
728ALWAYS_INLINE static Scheme scheme(StringView scheme)
729{
730 auto length = scheme.length();
731 if (!length)
732 return Scheme::NonSpecial;
733 switch (scheme[0]) {
734 case 'f':
735 switch (length) {
736 case 3:
737 if (scheme[1] == 't'
738 && scheme[2] == 'p')
739 return Scheme::FTP;
740 return Scheme::NonSpecial;
741 case 4:
742 if (scheme[1] == 'i'
743 && scheme[2] == 'l'
744 && scheme[3] == 'e')
745 return Scheme::File;
746 return Scheme::NonSpecial;
747 default:
748 return Scheme::NonSpecial;
749 }
750 case 'g':
751 if (length == 6
752 && scheme[1] == 'o'
753 && scheme[2] == 'p'
754 && scheme[3] == 'h'
755 && scheme[4] == 'e'
756 && scheme[5] == 'r')
757 return Scheme::Gopher;
758 return Scheme::NonSpecial;
759 case 'h':
760 switch (length) {
761 case 4:
762 if (scheme[1] == 't'
763 && scheme[2] == 't'
764 && scheme[3] == 'p')
765 return Scheme::HTTP;
766 return Scheme::NonSpecial;
767 case 5:
768 if (scheme[1] == 't'
769 && scheme[2] == 't'
770 && scheme[3] == 'p'
771 && scheme[4] == 's')
772 return Scheme::HTTPS;
773 return Scheme::NonSpecial;
774 default:
775 return Scheme::NonSpecial;
776 }
777 case 'w':
778 switch (length) {
779 case 2:
780 if (scheme[1] == 's')
781 return Scheme::WS;
782 return Scheme::NonSpecial;
783 case 3:
784 if (scheme[1] == 's'
785 && scheme[2] == 's')
786 return Scheme::WSS;
787 return Scheme::NonSpecial;
788 default:
789 return Scheme::NonSpecial;
790 }
791 default:
792 return Scheme::NonSpecial;
793 }
794}
795
796Optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
797{
798 if (scheme.isEmpty())
799 return WTF::nullopt;
800
801 if (!isASCIIAlpha(scheme[0]))
802 return WTF::nullopt;
803
804 for (size_t i = 1; i < scheme.length(); ++i) {
805 if (isASCIIAlphanumeric(scheme[i]) || scheme[i] == '+' || scheme[i] == '-' || scheme[i] == '.')
806 continue;
807 return WTF::nullopt;
808 }
809
810 return scheme.convertToASCIILowercase();
811}
812
813bool URLParser::isSpecialScheme(const String& schemeArg)
814{
815 return scheme(schemeArg) != Scheme::NonSpecial;
816}
817
// Names the component boundaries of a parsed URL, in the order the components
// appear. urlLengthUntilPart() maps each value to the matching offset in a
// URL, and copyURLPartsUntil() copies every offset at or before a given part.
enum class URLParser::URLPart {
    SchemeEnd,
    UserStart,
    UserEnd,
    PasswordEnd,
    HostEnd,
    PortEnd,
    PathAfterLastSlash,
    PathEnd,
    QueryEnd,
};
829
830size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
831{
832 switch (part) {
833 case URLPart::QueryEnd:
834 return url.m_queryEnd;
835 case URLPart::PathEnd:
836 return url.m_pathEnd;
837 case URLPart::PathAfterLastSlash:
838 return url.m_pathAfterLastSlash;
839 case URLPart::PortEnd:
840 return url.m_hostEnd + url.m_portLength;
841 case URLPart::HostEnd:
842 return url.m_hostEnd;
843 case URLPart::PasswordEnd:
844 return url.m_passwordEnd;
845 case URLPart::UserEnd:
846 return url.m_userEnd;
847 case URLPart::UserStart:
848 return url.m_userStart;
849 case URLPart::SchemeEnd:
850 return url.m_schemeEnd;
851 }
852 ASSERT_NOT_REACHED();
853 return 0;
854}
855
856void URLParser::copyASCIIStringUntil(const String& string, size_t length)
857{
858 RELEASE_ASSERT(length <= string.length());
859 if (string.isNull())
860 return;
861 ASSERT(m_asciiBuffer.isEmpty());
862 if (string.is8Bit())
863 appendToASCIIBuffer(string.characters8(), length);
864 else {
865 const UChar* characters = string.characters16();
866 for (size_t i = 0; i < length; ++i) {
867 UChar c = characters[i];
868 ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
869 appendToASCIIBuffer(c);
870 }
871 }
872}
873
// Starts the output from the prefix of |base| that ends at |part|.
// It reports a syntax violation at |iterator|, replaces the ASCII buffer with
// base's string up to that part, copies the matching offsets into m_url, and
// then sets m_urlIsSpecial / m_urlIsFile from the scheme that was copied.
// |nonUTF8QueryEncoding| is reset to nullptr for ws, wss and non-special
// schemes, and left untouched for the others.
template<typename CharacterType>
void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, const URLTextEncoding*& nonUTF8QueryEncoding)
{
    // Appends to the ASCII buffer only take effect after a violation is recorded.
    syntaxViolation(iterator);

    m_asciiBuffer.clear();
    copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
    // Cases run from the last part to the first and each falls through, so
    // every offset at or before |part| is copied from |base|.
    switch (part) {
    case URLPart::QueryEnd:
        m_url.m_queryEnd = base.m_queryEnd;
        FALLTHROUGH;
    case URLPart::PathEnd:
        m_url.m_pathEnd = base.m_pathEnd;
        FALLTHROUGH;
    case URLPart::PathAfterLastSlash:
        m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
        FALLTHROUGH;
    case URLPart::PortEnd:
        m_url.m_portLength = base.m_portLength;
        FALLTHROUGH;
    case URLPart::HostEnd:
        m_url.m_hostEnd = base.m_hostEnd;
        FALLTHROUGH;
    case URLPart::PasswordEnd:
        m_url.m_passwordEnd = base.m_passwordEnd;
        FALLTHROUGH;
    case URLPart::UserEnd:
        m_url.m_userEnd = base.m_userEnd;
        FALLTHROUGH;
    case URLPart::UserStart:
        m_url.m_userStart = base.m_userStart;
        FALLTHROUGH;
    case URLPart::SchemeEnd:
        m_url.m_isValid = base.m_isValid;
        m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
        m_url.m_schemeEnd = base.m_schemeEnd;
    }
    // Classify the scheme that now sits at the front of the ASCII buffer.
    switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) {
    case Scheme::WS:
    case Scheme::WSS:
        nonUTF8QueryEncoding = nullptr;
        m_urlIsSpecial = true;
        return;
    case Scheme::File:
        m_urlIsFile = true;
        FALLTHROUGH;
    case Scheme::FTP:
    case Scheme::Gopher:
    case Scheme::HTTP:
    case Scheme::HTTPS:
        m_urlIsSpecial = true;
        return;
    case Scheme::NonSpecial:
        m_urlIsSpecial = false;
        nonUTF8QueryEncoding = nullptr;
        return;
    }
    ASSERT_NOT_REACHED();
}
933
// The two hex digits that follow '%' in a percent-encoded '.' ("%2e"). The
// dot-segment helpers compare the second digit after toASCIILower, so "%2E"
// matches as well.
static const char dotASCIICode[2] = {'2', 'e'};
935
936template<typename CharacterType>
937ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
938{
939 if (c.atEnd())
940 return false;
941 if (*c == '.') {
942 advance<CharacterType, ReportSyntaxViolation::No>(c);
943 return c.atEnd() || isSlashQuestionOrHash(*c);
944 }
945 if (*c != '%')
946 return false;
947 advance<CharacterType, ReportSyntaxViolation::No>(c);
948 if (c.atEnd() || *c != dotASCIICode[0])
949 return false;
950 advance<CharacterType, ReportSyntaxViolation::No>(c);
951 if (c.atEnd())
952 return false;
953 if (toASCIILower(*c) == dotASCIICode[1]) {
954 advance<CharacterType, ReportSyntaxViolation::No>(c);
955 return c.atEnd() || isSlashQuestionOrHash(*c);
956 }
957 return false;
958}
959
960template<typename CharacterType>
961ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
962{
963 if (c.atEnd())
964 return false;
965 if (*c == '.') {
966 advance<CharacterType, ReportSyntaxViolation::No>(c);
967 return isSingleDotPathSegment(c);
968 }
969 if (*c != '%')
970 return false;
971 advance<CharacterType, ReportSyntaxViolation::No>(c);
972 if (c.atEnd() || *c != dotASCIICode[0])
973 return false;
974 advance<CharacterType, ReportSyntaxViolation::No>(c);
975 if (c.atEnd())
976 return false;
977 if (toASCIILower(*c) == dotASCIICode[1]) {
978 advance<CharacterType, ReportSyntaxViolation::No>(c);
979 return isSingleDotPathSegment(c);
980 }
981 return false;
982}
983
984template<typename CharacterType>
985void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
986{
987 ASSERT(isSingleDotPathSegment(c));
988 if (*c == '.') {
989 advance(c);
990 if (!c.atEnd()) {
991 if (*c == '/' || *c == '\\')
992 advance(c);
993 else
994 ASSERT(*c == '?' || *c == '#');
995 }
996 } else {
997 ASSERT(*c == '%');
998 advance(c);
999 ASSERT(*c == dotASCIICode[0]);
1000 advance(c);
1001 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1002 advance(c);
1003 if (!c.atEnd()) {
1004 if (*c == '/' || *c == '\\')
1005 advance(c);
1006 else
1007 ASSERT(*c == '?' || *c == '#');
1008 }
1009 }
1010}
1011
1012template<typename CharacterType>
1013void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
1014{
1015 ASSERT(isDoubleDotPathSegment(c));
1016 if (*c == '.')
1017 advance(c);
1018 else {
1019 ASSERT(*c == '%');
1020 advance(c);
1021 ASSERT(*c == dotASCIICode[0]);
1022 advance(c);
1023 ASSERT(toASCIILower(*c) == dotASCIICode[1]);
1024 advance(c);
1025 }
1026 consumeSingleDotPathSegment(c);
1027}
1028
1029bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1030{
1031 ASSERT(m_didSeeSyntaxViolation);
1032 if (!m_urlIsFile)
1033 return true;
1034
1035 ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
1036 CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash);
1037 if (newPathAfterLastSlash == m_url.m_hostEnd + m_url.m_portLength + 1 && isWindowsDriveLetter(componentToPop))
1038 return false;
1039 return true;
1040}
1041
// Removes the last path segment from the ASCII buffer. It moves
// m_pathAfterLastSlash back to just after the previous '/', unless
// shouldPopPath() vetoes the move, and then truncates the buffer to
// m_pathAfterLastSlash. When m_pathAfterLastSlash is not greater than
// m_hostEnd + m_portLength + 1, the offset is left alone and only the
// truncation happens.
void URLParser::popPath()
{
    ASSERT(m_didSeeSyntaxViolation);
    if (m_url.m_pathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength + 1) {
        // Start on the character just before the current offset; if that is
        // a '/', step back once more.
        auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1;
        if (m_asciiBuffer[newPathAfterLastSlash] == '/')
            newPathAfterLastSlash--;
        // Scan backwards for the previous '/', never going below
        // m_hostEnd + m_portLength.
        while (newPathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength && m_asciiBuffer[newPathAfterLastSlash] != '/')
            newPathAfterLastSlash--;
        // Move one past the position where the scan stopped.
        newPathAfterLastSlash++;
        if (shouldPopPath(newPathAfterLastSlash))
            m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
    }
    m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
}
1057
1058template<typename CharacterType>
1059void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1060{
1061 if (m_didSeeSyntaxViolation)
1062 return;
1063 m_didSeeSyntaxViolation = true;
1064
1065 ASSERT(m_asciiBuffer.isEmpty());
1066 size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1067 RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1068 m_asciiBuffer.reserveCapacity(m_inputString.length());
1069 for (size_t i = 0; i < codeUnitsToCopy; ++i) {
1070 ASSERT(isASCII(m_inputString[i]));
1071 m_asciiBuffer.uncheckedAppend(m_inputString[i]);
1072 }
1073}
1074
1075void URLParser::failure()
1076{
1077 m_url.invalidate();
1078 m_url.m_string = m_inputString;
1079}
1080
1081template<typename CharacterType>
1082bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1083{
1084 if (iterator.atEnd() || toASCIILower(*iterator) != codePoint)
1085 return false;
1086 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1087 return true;
1088}
1089
1090template<typename CharacterType>
1091bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1092{
1093 if (!checkLocalhostCodePoint(iterator, 'l'))
1094 return false;
1095 if (!checkLocalhostCodePoint(iterator, 'o'))
1096 return false;
1097 if (!checkLocalhostCodePoint(iterator, 'c'))
1098 return false;
1099 if (!checkLocalhostCodePoint(iterator, 'a'))
1100 return false;
1101 if (!checkLocalhostCodePoint(iterator, 'l'))
1102 return false;
1103 if (!checkLocalhostCodePoint(iterator, 'h'))
1104 return false;
1105 if (!checkLocalhostCodePoint(iterator, 'o'))
1106 return false;
1107 if (!checkLocalhostCodePoint(iterator, 's'))
1108 return false;
1109 if (!checkLocalhostCodePoint(iterator, 't'))
1110 return false;
1111 return iterator.atEnd();
1112}
1113
1114bool URLParser::isLocalhost(StringView view)
1115{
1116 if (view.is8Bit())
1117 return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1118 return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1119}
1120
1121ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1122{
1123 if (UNLIKELY(m_didSeeSyntaxViolation)) {
1124 ASSERT(start + length <= m_asciiBuffer.size());
1125 return StringView(m_asciiBuffer.data() + start, length);
1126 }
1127 ASSERT(start + length <= m_inputString.length());
1128 return StringView(m_inputString).substring(start, length);
1129}
1130
1131ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1132{
1133 if (UNLIKELY(m_didSeeSyntaxViolation))
1134 return m_asciiBuffer[position];
1135 return m_inputString[position];
1136}
1137
1138template<typename CharacterType>
1139ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1140{
1141 if (UNLIKELY(m_didSeeSyntaxViolation))
1142 return m_asciiBuffer.size();
1143
1144 return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1145}
1146
1147URLParser::URLParser(const String& input, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1148 : m_inputString(input)
1149{
1150 if (input.isNull()) {
1151 if (base.isValid() && !base.m_cannotBeABaseURL) {
1152 m_url = base;
1153 m_url.removeFragmentIdentifier();
1154 }
1155 return;
1156 }
1157
1158 if (input.is8Bit()) {
1159 m_inputBegin = input.characters8();
1160 parse(input.characters8(), input.length(), base, nonUTF8QueryEncoding);
1161 } else {
1162 m_inputBegin = input.characters16();
1163 parse(input.characters16(), input.length(), base, nonUTF8QueryEncoding);
1164 }
1165
1166 ASSERT(!m_url.m_isValid
1167 || m_didSeeSyntaxViolation == (m_url.string() != input)
1168 || (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1169 && m_url.m_string == base.m_string.left(base.m_queryEnd)));
1170 ASSERT(internalValuesConsistent(m_url));
1171#if !ASSERT_DISABLED
1172 if (!m_didSeeSyntaxViolation) {
1173 // Force a syntax violation at the beginning to make sure we get the same result.
1174 URLParser parser(makeString(" ", input), base, nonUTF8QueryEncoding);
1175 URL parsed = parser.result();
1176 if (parsed.isValid())
1177 ASSERT(allValuesEqual(parser.result(), m_url));
1178 }
1179#endif
1180}
1181
// Core state machine of the parser. Walks |input| one code point at a time,
// recording component offsets in m_url. While no syntax violation has been
// reported, offsets refer to the input string itself; after the first call to
// syntaxViolation() the canonical text is accumulated in m_asciiBuffer and
// offsets refer to that buffer (see currentPosition()).
//
// input / length: the code units to parse (8-bit or 16-bit).
// base: URL used to resolve relative input; consulted in the NoScheme,
//     Relative*, File* and SpecialRelativeOrAuthority states.
// nonUTF8QueryEncoding: when non-null, the query is collected in queryBuffer
//     and encoded via encodeNonUTF8Query(); it is reset to nullptr for ws, wss
//     and non-special schemes, which therefore take the UTF8Query path.
//
// On failure, failure() is called and the function returns early. Otherwise
// m_url.m_string is set and m_url.m_isValid becomes true at the end.
1182template<typename CharacterType>
1183void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1184{
1185 URL_PARSER_LOG("Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
1186 m_url = { };
1187 ASSERT(m_asciiBuffer.isEmpty());
1188
1189 Vector<UChar> queryBuffer;
1190
    // Trim trailing C0-control/space code units. Each one is reported as a
    // syntax violation using an iterator positioned at the start of the input.
1191 unsigned endIndex = length;
1192 while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) {
1193 syntaxViolation(CodePointIterator<CharacterType>(input, input));
1194 endIndex--;
1195 }
1196 CodePointIterator<CharacterType> c(input, input + endIndex);
1197 CodePointIterator<CharacterType> authorityOrHostBegin;
1198 CodePointIterator<CharacterType> queryBegin;
    // Skip leading C0-control/space code points, also as syntax violations.
1199 while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1200 syntaxViolation(c);
1201 ++c;
1202 }
    // Restart point used when scheme detection is abandoned (see NoScheme transitions).
1203 auto beginAfterControlAndSpace = c;
1204
1205 enum class State : uint8_t {
1206 SchemeStart,
1207 Scheme,
1208 NoScheme,
1209 SpecialRelativeOrAuthority,
1210 PathOrAuthority,
1211 Relative,
1212 RelativeSlash,
1213 SpecialAuthoritySlashes,
1214 SpecialAuthorityIgnoreSlashes,
1215 AuthorityOrHost,
1216 Host,
1217 File,
1218 FileSlash,
1219 FileHost,
1220 PathStart,
1221 Path,
1222 CannotBeABaseURLPath,
1223 UTF8Query,
1224 NonUTF8Query,
1225 Fragment,
1226 };
1227
1228#define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1229#define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1230
1231 State state = State::SchemeStart;
1232 while (!c.atEnd()) {
    // Tabs and newlines anywhere in the input are dropped and recorded as a syntax violation.
1233 if (UNLIKELY(isTabOrNewline(*c))) {
1234 syntaxViolation(c);
1235 ++c;
1236 continue;
1237 }
1238
1239 switch (state) {
    // SchemeStart: an ASCII letter begins a (lowercased) scheme; anything else means no scheme.
    // If the input ends right after that letter, restart from the beginning in NoScheme.
1240 case State::SchemeStart:
1241 LOG_STATE("SchemeStart");
1242 if (isASCIIAlpha(*c)) {
1243 if (UNLIKELY(isASCIIUpper(*c)))
1244 syntaxViolation(c);
1245 appendToASCIIBuffer(toASCIILower(*c));
1246 advance(c);
1247 if (c.atEnd()) {
1248 m_asciiBuffer.clear();
1249 state = State::NoScheme;
1250 c = beginAfterControlAndSpace;
1251 break;
1252 }
1253 state = State::Scheme;
1254 } else
1255 state = State::NoScheme;
1256 break;
    // Scheme: accumulate scheme characters until ':'. At ':' the scheme is classified and the
    // next state chosen; any other code point (or end of input) abandons the scheme and restarts in NoScheme.
1257 case State::Scheme:
1258 LOG_STATE("Scheme");
1259 if (isValidSchemeCharacter(*c)) {
1260 if (UNLIKELY(isASCIIUpper(*c)))
1261 syntaxViolation(c);
1262 appendToASCIIBuffer(toASCIILower(*c));
1263 } else if (*c == ':') {
1264 unsigned schemeEnd = currentPosition(c);
    // Schemes longer than URL::maxSchemeLength are rejected.
1265 if (schemeEnd > URL::maxSchemeLength) {
1266 failure();
1267 return;
1268 }
1269 m_url.m_schemeEnd = schemeEnd;
1270 StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd);
1271 appendToASCIIBuffer(':');
1272 switch (scheme(urlScheme)) {
1273 case Scheme::File:
1274 m_urlIsSpecial = true;
1275 m_urlIsFile = true;
1276 state = State::File;
1277 ++c;
1278 break;
1279 case Scheme::WS:
1280 case Scheme::WSS:
    // ws/wss drop the non-UTF-8 query encoding, so their queries take the UTF8Query path.
1281 nonUTF8QueryEncoding = nullptr;
1282 m_urlIsSpecial = true;
1283 if (base.protocolIs(urlScheme))
1284 state = State::SpecialRelativeOrAuthority;
1285 else
1286 state = State::SpecialAuthoritySlashes;
1287 ++c;
1288 break;
1289 case Scheme::HTTP:
1290 case Scheme::HTTPS:
1291 m_url.m_protocolIsInHTTPFamily = true;
1292 FALLTHROUGH;
1293 case Scheme::FTP:
1294 case Scheme::Gopher:
1295 m_urlIsSpecial = true;
1296 if (base.protocolIs(urlScheme))
1297 state = State::SpecialRelativeOrAuthority;
1298 else
1299 state = State::SpecialAuthoritySlashes;
1300 ++c;
1301 break;
1302 case Scheme::NonSpecial:
    // Non-special scheme: "scheme:/..." may have an authority; anything else is a cannot-be-a-base URL.
1303 nonUTF8QueryEncoding = nullptr;
1304 auto maybeSlash = c;
1305 advance(maybeSlash);
1306 if (!maybeSlash.atEnd() && *maybeSlash == '/') {
1307 appendToASCIIBuffer('/');
1308 c = maybeSlash;
1309 state = State::PathOrAuthority;
1310 ASSERT(*c == '/');
1311 ++c;
1312 m_url.m_userStart = currentPosition(c);
1313 } else {
1314 ++c;
1315 m_url.m_userStart = currentPosition(c);
1316 m_url.m_userEnd = m_url.m_userStart;
1317 m_url.m_passwordEnd = m_url.m_userStart;
1318 m_url.m_hostEnd = m_url.m_userStart;
1319 m_url.m_portLength = 0;
1320 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1321 m_url.m_cannotBeABaseURL = true;
1322 state = State::CannotBeABaseURLPath;
1323 }
1324 break;
1325 }
1326 break;
1327 } else {
1328 m_asciiBuffer.clear();
1329 state = State::NoScheme;
1330 c = beginAfterControlAndSpace;
1331 break;
1332 }
1333 advance(c);
1334 if (c.atEnd()) {
1335 m_asciiBuffer.clear();
1336 state = State::NoScheme;
1337 c = beginAfterControlAndSpace;
1338 }
1339 break;
    // NoScheme: relative input. Requires a valid base; a cannot-be-a-base base only accepts
    // input that starts with '#'. A file base continues in File, any other base in Relative.
1340 case State::NoScheme:
1341 LOG_STATE("NoScheme");
1342 if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) {
1343 failure();
1344 return;
1345 }
1346 if (base.m_cannotBeABaseURL && *c == '#') {
1347 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1348 state = State::Fragment;
1349 appendToASCIIBuffer('#');
1350 ++c;
1351 break;
1352 }
1353 if (!base.protocolIs("file")) {
1354 state = State::Relative;
1355 break;
1356 }
1357 copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1358 appendToASCIIBuffer(':');
1359 state = State::File;
1360 break;
    // SpecialRelativeOrAuthority: special scheme equal to the base's scheme. "//" starts an
    // authority, a single '/' goes to RelativeSlash, anything else to Relative.
    // Input ending right after a single '/' is a failure.
1361 case State::SpecialRelativeOrAuthority:
1362 LOG_STATE("SpecialRelativeOrAuthority");
1363 if (*c == '/') {
1364 appendToASCIIBuffer('/');
1365 advance(c);
1366 if (c.atEnd()) {
1367 failure();
1368 return;
1369 }
1370 if (*c == '/') {
1371 appendToASCIIBuffer('/');
1372 state = State::SpecialAuthorityIgnoreSlashes;
1373 ++c;
1374 } else
1375 state = State::RelativeSlash;
1376 } else
1377 state = State::Relative;
1378 break;
    // PathOrAuthority: reached after "scheme:/" for a non-special scheme. A second '/' starts an
    // authority; otherwise the URL has an empty authority and the path begins at the preceding '/'.
1379 case State::PathOrAuthority:
1380 LOG_STATE("PathOrAuthority");
1381 if (*c == '/') {
1382 appendToASCIIBuffer('/');
1383 state = State::AuthorityOrHost;
1384 advance(c);
1385 m_url.m_userStart = currentPosition(c);
1386 authorityOrHostBegin = c;
1387 } else {
1388 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1389 m_url.m_userStart = currentPosition(c) - 1;
1390 m_url.m_userEnd = m_url.m_userStart;
1391 m_url.m_passwordEnd = m_url.m_userStart;
1392 m_url.m_hostEnd = m_url.m_userStart;
1393 m_url.m_portLength = 0;
1394 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1395 state = State::Path;
1396 }
1397 break;
    // Relative: copy as much of the base as the first code point leaves untouched:
    // up to the path end for '?', up to the query end for '#', up to the last slash otherwise.
1398 case State::Relative:
1399 LOG_STATE("Relative");
1400 switch (*c) {
1401 case '/':
1402 case '\\':
1403 state = State::RelativeSlash;
1404 ++c;
1405 break;
1406 case '?':
1407 copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1408 appendToASCIIBuffer('?');
1409 ++c;
1410 if (nonUTF8QueryEncoding) {
1411 queryBegin = c;
1412 state = State::NonUTF8Query;
1413 } else
1414 state = State::UTF8Query;
1415 break;
1416 case '#':
1417 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1418 appendToASCIIBuffer('#');
1419 state = State::Fragment;
1420 ++c;
1421 break;
1422 default:
1423 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
    // Make sure the copied prefix ends with '/' before the relative path is appended.
1424 if (currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') {
1425 appendToASCIIBuffer('/');
1426 m_url.m_pathAfterLastSlash = currentPosition(c);
1427 }
1428 state = State::Path;
1429 break;
1430 }
1431 break;
    // RelativeSlash: a second slash means only the base's scheme is kept and an authority
    // follows; otherwise the base is kept through its port and a new absolute path starts.
1432 case State::RelativeSlash:
1433 LOG_STATE("RelativeSlash");
1434 if (*c == '/' || *c == '\\') {
1435 ++c;
1436 copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1437 appendToASCIIBuffer("://", 3);
1438 if (m_urlIsSpecial)
1439 state = State::SpecialAuthorityIgnoreSlashes;
1440 else {
1441 m_url.m_userStart = currentPosition(c);
1442 state = State::AuthorityOrHost;
1443 authorityOrHostBegin = c;
1444 }
1445 } else {
1446 copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1447 appendToASCIIBuffer('/');
1448 m_url.m_pathAfterLastSlash = base.m_hostEnd + base.m_portLength + 1;
1449 state = State::Path;
1450 }
1451 break;
    // SpecialAuthoritySlashes: expect "//" after a special scheme. Backslashes and missing
    // slashes are syntax violations; the output always receives exactly "//".
1452 case State::SpecialAuthoritySlashes:
1453 LOG_STATE("SpecialAuthoritySlashes");
1454 if (LIKELY(*c == '/' || *c == '\\')) {
1455 if (UNLIKELY(*c == '\\'))
1456 syntaxViolation(c);
1457 appendToASCIIBuffer('/');
1458 advance(c);
1459 if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) {
1460 if (UNLIKELY(*c == '\\'))
1461 syntaxViolation(c);
1462 ++c;
1463 appendToASCIIBuffer('/');
1464 } else {
1465 syntaxViolation(c);
1466 appendToASCIIBuffer('/');
1467 }
1468 } else {
1469 syntaxViolation(c);
1470 appendToASCIIBuffer("//", 2);
1471 }
1472 state = State::SpecialAuthorityIgnoreSlashes;
1473 break;
    // SpecialAuthorityIgnoreSlashes: swallow any extra slashes (as syntax violations), then start the authority.
1474 case State::SpecialAuthorityIgnoreSlashes:
1475 LOG_STATE("SpecialAuthorityIgnoreSlashes");
1476 if (*c == '/' || *c == '\\') {
1477 syntaxViolation(c);
1478 ++c;
1479 } else {
1480 m_url.m_userStart = currentPosition(c);
1481 state = State::AuthorityOrHost;
1482 authorityOrHostBegin = c;
1483 }
1484 break;
    // AuthorityOrHost: scan forward until '@' or a terminator ('/', '?', '#', and '\\' for special URLs).
    // On '@', everything up to the last '@' before the terminator is parsed as credentials and the
    // host scan continues in Host. On a terminator, the scanned range is parsed as host and port.
1485 case State::AuthorityOrHost:
1486 do {
1487 LOG_STATE("AuthorityOrHost");
1488 if (*c == '@') {
1489 auto lastAt = c;
1490 auto findLastAt = c;
1491 while (!findLastAt.atEnd()) {
1492 URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1493 if (*findLastAt == '@')
1494 lastAt = findLastAt;
1495 bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\');
1496 if (isSlash || *findLastAt == '?' || *findLastAt == '#')
1497 break;
1498 ++findLastAt;
1499 }
1500 parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1501 c = lastAt;
1502 advance(c);
1503 authorityOrHostBegin = c;
1504 state = State::Host;
1505 m_hostHasPercentOrNonASCII = false;
1506 break;
1507 }
1508 bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\');
1509 if (isSlash || *c == '?' || *c == '#') {
1510 auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1511 if (iterator.atEnd()) {
    // Empty host: a failure for special URLs, an empty authority otherwise.
1512 if (m_urlIsSpecial)
1513 return failure();
1514 m_url.m_userEnd = currentPosition(c);
1515 m_url.m_passwordEnd = m_url.m_userEnd;
1516 m_url.m_hostEnd = m_url.m_userEnd;
1517 m_url.m_portLength = 0;
1518 m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1519 } else {
1520 m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1521 m_url.m_passwordEnd = m_url.m_userEnd;
1522 if (!parseHostAndPort(iterator)) {
1523 failure();
1524 return;
1525 }
    // Host followed directly by '?' or '#': special URLs get an explicit '/' path inserted.
1526 if (UNLIKELY(!isSlash)) {
1527 if (m_urlIsSpecial) {
1528 syntaxViolation(c);
1529 appendToASCIIBuffer('/');
1530 }
1531 m_url.m_pathAfterLastSlash = currentPosition(c);
1532 }
1533 }
1534 state = State::Path;
1535 break;
1536 }
1537 if (isPercentOrNonASCII(*c))
1538 m_hostHasPercentOrNonASCII = true;
1539 ++c;
1540 } while (!c.atEnd());
1541 break;
    // Host: reached after credentials. Scan to '/', '?' or '#', parse host and port, and
    // insert a '/' path when the host is followed directly by '?' or '#'.
1542 case State::Host:
1543 do {
1544 LOG_STATE("Host");
1545 if (*c == '/' || *c == '?' || *c == '#') {
1546 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1547 failure();
1548 return;
1549 }
1550 if (*c == '?' || *c == '#') {
1551 syntaxViolation(c);
1552 appendToASCIIBuffer('/');
1553 m_url.m_pathAfterLastSlash = currentPosition(c);
1554 }
1555 state = State::Path;
1556 break;
1557 }
1558 if (isPercentOrNonASCII(*c))
1559 m_hostHasPercentOrNonASCII = true;
1560 ++c;
1561 } while (!c.atEnd());
1562 break;
    // File: first code point after "file:". Slashes lead to FileSlash. '?', '#' and path text
    // either reuse parts of a file base or synthesize an empty-host "///" prefix.
1563 case State::File:
1564 LOG_STATE("File");
1565 switch (*c) {
1566 case '\\':
1567 syntaxViolation(c);
1568 FALLTHROUGH;
1569 case '/':
1570 appendToASCIIBuffer('/');
1571 state = State::FileSlash;
1572 ++c;
1573 break;
1574 case '?':
1575 syntaxViolation(c);
1576 if (base.isValid() && base.protocolIs("file")) {
1577 copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1578 appendToASCIIBuffer('?');
1579 ++c;
1580 } else {
1581 appendToASCIIBuffer("///?", 4);
1582 ++c;
1583 m_url.m_userStart = currentPosition(c) - 2;
1584 m_url.m_userEnd = m_url.m_userStart;
1585 m_url.m_passwordEnd = m_url.m_userStart;
1586 m_url.m_hostEnd = m_url.m_userStart;
1587 m_url.m_portLength = 0;
1588 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1589 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1590 }
1591 if (nonUTF8QueryEncoding) {
1592 queryBegin = c;
1593 state = State::NonUTF8Query;
1594 } else
1595 state = State::UTF8Query;
1596 break;
1597 case '#':
1598 syntaxViolation(c);
1599 if (base.isValid() && base.protocolIs("file")) {
1600 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1601 appendToASCIIBuffer('#');
1602 } else {
1603 appendToASCIIBuffer("///#", 4);
1604 m_url.m_userStart = currentPosition(c) - 2;
1605 m_url.m_userEnd = m_url.m_userStart;
1606 m_url.m_passwordEnd = m_url.m_userStart;
1607 m_url.m_hostEnd = m_url.m_userStart;
1608 m_url.m_portLength = 0;
1609 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1610 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1611 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1612 }
1613 state = State::Fragment;
1614 ++c;
1615 break;
1616 default:
1617 syntaxViolation(c);
1618 if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1619 copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
1620 else {
1621 appendToASCIIBuffer("///", 3);
1622 m_url.m_userStart = currentPosition(c) - 1;
1623 m_url.m_userEnd = m_url.m_userStart;
1624 m_url.m_passwordEnd = m_url.m_userStart;
1625 m_url.m_hostEnd = m_url.m_userStart;
1626 m_url.m_portLength = 0;
1627 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1628 if (isWindowsDriveLetter(c))
1629 appendWindowsDriveLetter(c);
1630 }
1631 state = State::Path;
1632 break;
1633 }
1634 break;
    // FileSlash: after "file:/". A second slash starts a (possibly empty) file host; otherwise
    // the host is empty and the path starts here, optionally reusing the base's Windows drive letter.
1635 case State::FileSlash:
1636 LOG_STATE("FileSlash");
1637 if (LIKELY(*c == '/' || *c == '\\')) {
1638 if (UNLIKELY(*c == '\\'))
1639 syntaxViolation(c);
1640 appendToASCIIBuffer('/');
1641 advance(c);
1642 m_url.m_userStart = currentPosition(c);
1643 m_url.m_userEnd = m_url.m_userStart;
1644 m_url.m_passwordEnd = m_url.m_userStart;
1645 m_url.m_hostEnd = m_url.m_userStart;
1646 m_url.m_portLength = 0;
1647 authorityOrHostBegin = c;
1648 state = State::FileHost;
1649 break;
1650 }
1651 syntaxViolation(c);
1652 appendToASCIIBuffer("//", 2);
1653 m_url.m_userStart = currentPosition(c) - 1;
1654 m_url.m_userEnd = m_url.m_userStart;
1655 m_url.m_passwordEnd = m_url.m_userStart;
1656 m_url.m_hostEnd = m_url.m_userStart;
1657 m_url.m_portLength = 0;
1658 if (isWindowsDriveLetter(c)) {
1659 appendWindowsDriveLetter(c);
1660 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1661 } else if (copyBaseWindowsDriveLetter(base)) {
1662 appendToASCIIBuffer('/');
1663 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1664 } else
1665 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1666 state = State::Path;
1667 break;
    // FileHost: scan the host of a file URL up to '/', '?' or '#'.
1668 case State::FileHost:
1669 do {
1670 LOG_STATE("FileHost");
1671 if (isSlashQuestionOrHash(*c)) {
    // A two-code-point "host" that is a Windows drive letter is treated as the start of the path.
1672 bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1673 && isWindowsDriveLetter(authorityOrHostBegin);
1674 if (windowsQuirk) {
1675 syntaxViolation(authorityOrHostBegin);
1676 appendToASCIIBuffer('/');
1677 appendWindowsDriveLetter(authorityOrHostBegin);
1678 }
    // Drive-letter quirk or empty host: continue straight into the path, query or fragment.
1679 if (windowsQuirk || authorityOrHostBegin == c) {
1680 ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/');
1681 if (UNLIKELY(*c == '?')) {
1682 syntaxViolation(c);
1683 appendToASCIIBuffer("/?", 2);
1684 ++c;
1685 if (nonUTF8QueryEncoding) {
1686 queryBegin = c;
1687 state = State::NonUTF8Query;
1688 } else
1689 state = State::UTF8Query;
1690 m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1691 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1692 break;
1693 }
1694 if (UNLIKELY(*c == '#')) {
1695 syntaxViolation(c);
1696 appendToASCIIBuffer("/#", 2);
1697 ++c;
1698 m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
1699 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1700 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1701 state = State::Fragment;
1702 break;
1703 }
1704 state = State::Path;
1705 break;
1706 }
1707 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1708 failure();
1709 return;
1710 }
    // A "localhost" host is removed from the serialized file URL: the buffer is cut back
    // to where the host started and the port length is cleared.
1711 if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1712 syntaxViolation(c);
1713 m_asciiBuffer.shrink(m_url.m_passwordEnd);
1714 m_url.m_hostEnd = currentPosition(c);
1715 m_url.m_portLength = 0;
1716 }
1717
1718 state = State::PathStart;
1719 break;
1720 }
1721 if (isPercentOrNonASCII(*c))
1722 m_hostHasPercentOrNonASCII = true;
1723 ++c;
1724 } while (!c.atEnd());
1725 break;
    // PathStart: ensure the path begins with '/', inserting one if the next code point is not a slash.
    // Does not consume input; Path handles the current code point on the next iteration.
1726 case State::PathStart:
1727 LOG_STATE("PathStart");
1728 if (*c != '/' && *c != '\\') {
1729 syntaxViolation(c);
1730 appendToASCIIBuffer('/');
1731 }
1732 m_url.m_pathAfterLastSlash = currentPosition(c);
1733 state = State::Path;
1734 break;
    // Path: slashes (and backslashes in special URLs) delimit segments. A segment that starts
    // right after a '/' is checked for ".." (pops the previous segment) and "." (skipped).
    // '?' and '#' end the path; every other code point is percent-encoded as needed.
1735 case State::Path:
1736 LOG_STATE("Path");
1737 if (*c == '/' || (m_urlIsSpecial && *c == '\\')) {
1738 if (UNLIKELY(m_urlIsSpecial && *c == '\\'))
1739 syntaxViolation(c);
1740 appendToASCIIBuffer('/');
1741 ++c;
1742 m_url.m_pathAfterLastSlash = currentPosition(c);
1743 break;
1744 }
1745 if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) {
1746 if (UNLIKELY(isDoubleDotPathSegment(c))) {
1747 syntaxViolation(c);
1748 consumeDoubleDotPathSegment(c);
1749 popPath();
1750 break;
1751 }
1752 if (UNLIKELY(isSingleDotPathSegment(c))) {
1753 syntaxViolation(c);
1754 consumeSingleDotPathSegment(c);
1755 break;
1756 }
1757 }
1758 if (*c == '?') {
1759 m_url.m_pathEnd = currentPosition(c);
1760 appendToASCIIBuffer('?');
1761 ++c;
1762 if (nonUTF8QueryEncoding) {
1763 queryBegin = c;
1764 state = State::NonUTF8Query;
1765 } else
1766 state = State::UTF8Query;
1767 break;
1768 }
    // '#' is not consumed here; the Fragment state handles it on the next iteration.
1769 if (*c == '#') {
1770 m_url.m_pathEnd = currentPosition(c);
1771 m_url.m_queryEnd = m_url.m_pathEnd;
1772 state = State::Fragment;
1773 break;
1774 }
1775 utf8PercentEncode<isInDefaultEncodeSet>(c);
1776 ++c;
1777 break;
    // CannotBeABaseURLPath: opaque path of a cannot-be-a-base URL. No dot-segment handling;
    // code points are encoded with the simple encode set.
1778 case State::CannotBeABaseURLPath:
1779 LOG_STATE("CannotBeABaseURLPath");
1780 if (*c == '?') {
1781 m_url.m_pathEnd = currentPosition(c);
1782 appendToASCIIBuffer('?');
1783 ++c;
1784 if (nonUTF8QueryEncoding) {
1785 queryBegin = c;
1786 state = State::NonUTF8Query;
1787 } else
1788 state = State::UTF8Query;
1789 } else if (*c == '#') {
1790 m_url.m_pathEnd = currentPosition(c);
1791 m_url.m_queryEnd = m_url.m_pathEnd;
1792 state = State::Fragment;
1793 } else if (*c == '/') {
1794 appendToASCIIBuffer('/');
1795 ++c;
1796 m_url.m_pathAfterLastSlash = currentPosition(c);
1797 } else {
1798 utf8PercentEncode<isInSimpleEncodeSet>(c);
1799 ++c;
1800 }
1801 break;
    // UTF8Query: encode query code points one at a time until '#'.
1802 case State::UTF8Query:
1803 LOG_STATE("UTF8Query");
1804 ASSERT(queryBegin == CodePointIterator<CharacterType>());
1805 if (*c == '#') {
1806 m_url.m_queryEnd = currentPosition(c);
1807 state = State::Fragment;
1808 break;
1809 }
1810 ASSERT(!nonUTF8QueryEncoding);
1811 utf8QueryEncode(c);
1812 ++c;
1813 break;
    // NonUTF8Query: collect the raw query in queryBuffer and encode it in one go when '#'
    // (or, in the final-state switch below, the end of input) is reached.
1814 case State::NonUTF8Query:
1815 do {
1816 LOG_STATE("NonUTF8Query");
1817 ASSERT(queryBegin != CodePointIterator<CharacterType>());
1818 if (*c == '#') {
1819 encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
1820 m_url.m_queryEnd = currentPosition(c);
1821 state = State::Fragment;
1822 break;
1823 }
1824 appendCodePoint(queryBuffer, *c);
1825 advance(c, queryBegin);
1826 } while (!c.atEnd());
1827 break;
    // Fragment: everything that remains is fragment text, encoded with the simple encode set.
1828 case State::Fragment:
1829 URL_PARSER_LOG("State Fragment");
1830 utf8PercentEncode<isInSimpleEncodeSet>(c);
1831 ++c;
1832 break;
1833 }
1834 }
1835
    // Input exhausted: finish according to the state the loop stopped in, filling in the
    // component offsets that have not been set yet.
1836 switch (state) {
1837 case State::SchemeStart:
1838 LOG_FINAL_STATE("SchemeStart");
    // Nothing was parsed at all: the result is the base without its fragment, if the base is usable.
1839 if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1840 m_url = base;
1841 m_url.removeFragmentIdentifier();
1842 return;
1843 }
1844 failure();
1845 return;
1846 case State::Scheme:
1847 LOG_FINAL_STATE("Scheme");
1848 failure();
1849 return;
1850 case State::NoScheme:
1851 LOG_FINAL_STATE("NoScheme");
1852 RELEASE_ASSERT_NOT_REACHED();
1853 case State::SpecialRelativeOrAuthority:
1854 LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1855 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1856 break;
1857 case State::PathOrAuthority:
1858 LOG_FINAL_STATE("PathOrAuthority");
1859 ASSERT(m_url.m_userStart);
1860 ASSERT(m_url.m_userStart == currentPosition(c));
1861 ASSERT(parsedDataView(currentPosition(c) - 1) == '/');
1862 m_url.m_userStart--;
1863 m_url.m_userEnd = m_url.m_userStart;
1864 m_url.m_passwordEnd = m_url.m_userStart;
1865 m_url.m_hostEnd = m_url.m_userStart;
1866 m_url.m_portLength = 0;
1867 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1868 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1869 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1870 break;
1871 case State::Relative:
1872 LOG_FINAL_STATE("Relative");
1873 RELEASE_ASSERT_NOT_REACHED();
1874 case State::RelativeSlash:
1875 LOG_FINAL_STATE("RelativeSlash");
1876 copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1877 appendToASCIIBuffer('/');
1878 m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1;
1879 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1880 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1881 break;
1882 case State::SpecialAuthoritySlashes:
1883 LOG_FINAL_STATE("SpecialAuthoritySlashes");
1884 m_url.m_userStart = currentPosition(c);
1885 m_url.m_userEnd = m_url.m_userStart;
1886 m_url.m_passwordEnd = m_url.m_userStart;
1887 m_url.m_hostEnd = m_url.m_userStart;
1888 m_url.m_portLength = 0;
1889 m_url.m_pathAfterLastSlash = m_url.m_userStart;
1890 m_url.m_pathEnd = m_url.m_userStart;
1891 m_url.m_queryEnd = m_url.m_userStart;
1892 break;
1893 case State::SpecialAuthorityIgnoreSlashes:
1894 LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1895 failure();
1896 return;
1897 case State::AuthorityOrHost:
1898 LOG_FINAL_STATE("AuthorityOrHost");
    // The input ended inside the authority: what was scanned is the host (and port), with no credentials.
1899 m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1900 m_url.m_passwordEnd = m_url.m_userEnd;
1901 if (authorityOrHostBegin.atEnd()) {
1902 m_url.m_userEnd = m_url.m_userStart;
1903 m_url.m_passwordEnd = m_url.m_userStart;
1904 m_url.m_hostEnd = m_url.m_userStart;
1905 m_url.m_portLength = 0;
1906 m_url.m_pathEnd = m_url.m_userStart;
1907 } else if (!parseHostAndPort(authorityOrHostBegin)) {
1908 failure();
1909 return;
1910 } else {
    // Special URLs always get a "/" path appended; non-special URLs keep an empty path.
1911 if (m_urlIsSpecial) {
1912 syntaxViolation(c);
1913 appendToASCIIBuffer('/');
1914 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1;
1915 } else
1916 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1917 }
1918 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1919 m_url.m_queryEnd = m_url.m_pathEnd;
1920 break;
1921 case State::Host:
1922 LOG_FINAL_STATE("Host");
1923 if (!parseHostAndPort(authorityOrHostBegin)) {
1924 failure();
1925 return;
1926 }
1927 if (m_urlIsSpecial) {
1928 syntaxViolation(c);
1929 appendToASCIIBuffer('/');
1930 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1;
1931 } else
1932 m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1933 m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1934 m_url.m_queryEnd = m_url.m_pathEnd;
1935 break;
1936 case State::File:
1937 LOG_FINAL_STATE("File");
    // Bare "file:" — reuse a file base through its query, or synthesize "file:///".
1938 if (base.isValid() && base.protocolIs("file")) {
1939 copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1940 break;
1941 }
1942 syntaxViolation(c);
1943 appendToASCIIBuffer("///", 3);
1944 m_url.m_userStart = currentPosition(c) - 1;
1945 m_url.m_userEnd = m_url.m_userStart;
1946 m_url.m_passwordEnd = m_url.m_userStart;
1947 m_url.m_hostEnd = m_url.m_userStart;
1948 m_url.m_portLength = 0;
1949 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1950 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1951 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1952 break;
1953 case State::FileSlash:
1954 LOG_FINAL_STATE("FileSlash");
1955 syntaxViolation(c);
    // m_userStart is computed before the "//" is appended, hence the "+ 1".
1956 m_url.m_userStart = currentPosition(c) + 1;
1957 appendToASCIIBuffer("//", 2);
1958 m_url.m_userEnd = m_url.m_userStart;
1959 m_url.m_passwordEnd = m_url.m_userStart;
1960 m_url.m_hostEnd = m_url.m_userStart;
1961 m_url.m_portLength = 0;
1962 if (copyBaseWindowsDriveLetter(base)) {
1963 appendToASCIIBuffer('/');
1964 m_url.m_pathAfterLastSlash = m_url.m_userStart + 4;
1965 } else
1966 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1967 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1968 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1969 break;
1970 case State::FileHost:
1971 LOG_FINAL_STATE("FileHost");
    // Same three cases as the in-loop FileHost handling: drive-letter quirk, empty host, real host.
1972 if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1973 && isWindowsDriveLetter(authorityOrHostBegin)) {
1974 syntaxViolation(authorityOrHostBegin);
1975 appendToASCIIBuffer('/');
1976 appendWindowsDriveLetter(authorityOrHostBegin);
1977 m_url.m_pathAfterLastSlash = currentPosition(c);
1978 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1979 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1980 break;
1981 }
1982
1983 if (authorityOrHostBegin == c) {
1984 syntaxViolation(c);
1985 appendToASCIIBuffer('/');
1986 m_url.m_userStart = currentPosition(c) - 1;
1987 m_url.m_userEnd = m_url.m_userStart;
1988 m_url.m_passwordEnd = m_url.m_userStart;
1989 m_url.m_hostEnd = m_url.m_userStart;
1990 m_url.m_portLength = 0;
1991 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
1992 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1993 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1994 break;
1995 }
1996
1997 if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1998 failure();
1999 return;
2000 }
2001
2002 syntaxViolation(c);
2003 if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
2004 m_asciiBuffer.shrink(m_url.m_passwordEnd);
2005 m_url.m_hostEnd = currentPosition(c);
2006 m_url.m_portLength = 0;
2007 }
2008 appendToASCIIBuffer('/');
2009 m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1;
2010 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
2011 m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
2012 break;
2013 case State::PathStart:
2014 LOG_FINAL_STATE("PathStart");
2015 RELEASE_ASSERT_NOT_REACHED();
2016 case State::Path:
2017 LOG_FINAL_STATE("Path");
2018 m_url.m_pathEnd = currentPosition(c);
2019 m_url.m_queryEnd = m_url.m_pathEnd;
2020 break;
2021 case State::CannotBeABaseURLPath:
2022 LOG_FINAL_STATE("CannotBeABaseURLPath");
2023 m_url.m_pathEnd = currentPosition(c);
2024 m_url.m_queryEnd = m_url.m_pathEnd;
2025 break;
2026 case State::UTF8Query:
2027 LOG_FINAL_STATE("UTF8Query");
2028 ASSERT(queryBegin == CodePointIterator<CharacterType>());
2029 m_url.m_queryEnd = currentPosition(c);
2030 break;
2031 case State::NonUTF8Query:
2032 LOG_FINAL_STATE("NonUTF8Query");
2033 ASSERT(queryBegin != CodePointIterator<CharacterType>());
2034 encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
2035 m_url.m_queryEnd = currentPosition(c);
2036 break;
2037 case State::Fragment:
2038 LOG_FINAL_STATE("Fragment");
2039 break;
2040 }
2041
    // Without a syntax violation the input string is used as the URL string unchanged;
    // otherwise the rebuilt buffer is adopted as the URL string.
2042 if (LIKELY(!m_didSeeSyntaxViolation)) {
2043 m_url.m_string = m_inputString;
2044 ASSERT(m_asciiBuffer.isEmpty());
2045 } else
2046 m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2047 m_url.m_isValid = true;
2048 URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
2049}
2050
2051template<typename CharacterType>
2052void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2053{
2054 if (UNLIKELY(iterator.atEnd())) {
2055 syntaxViolation(iterator);
2056 m_url.m_userEnd = currentPosition(iterator);
2057 m_url.m_passwordEnd = m_url.m_userEnd;
2058 return;
2059 }
2060 for (; !iterator.atEnd(); advance(iterator)) {
2061 if (*iterator == ':') {
2062 m_url.m_userEnd = currentPosition(iterator);
2063 auto iteratorAtColon = iterator;
2064 ++iterator;
2065 bool tabOrNewlineAfterColon = false;
2066 while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2067 tabOrNewlineAfterColon = true;
2068 ++iterator;
2069 }
2070 if (UNLIKELY(iterator.atEnd())) {
2071 syntaxViolation(iteratorAtColon);
2072 m_url.m_passwordEnd = m_url.m_userEnd;
2073 if (m_url.m_userEnd > m_url.m_userStart)
2074 appendToASCIIBuffer('@');
2075 return;
2076 }
2077 if (tabOrNewlineAfterColon)
2078 syntaxViolation(iteratorAtColon);
2079 appendToASCIIBuffer(':');
2080 break;
2081 }
2082 utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator);
2083 }
2084 for (; !iterator.atEnd(); advance(iterator))
2085 utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator);
2086 m_url.m_passwordEnd = currentPosition(iterator);
2087 if (!m_url.m_userEnd)
2088 m_url.m_userEnd = m_url.m_passwordEnd;
2089 appendToASCIIBuffer('@');
2090}
2091
2092template<typename UnsignedIntegerType>
2093void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2094{
2095 LChar buf[sizeof(UnsignedIntegerType) * 3 + 1];
2096 LChar* end = std::end(buf);
2097 LChar* p = end;
2098 do {
2099 *--p = (number % 10) + '0';
2100 number /= 10;
2101 } while (number);
2102 appendToASCIIBuffer(p, end - p);
2103}
2104
2105void URLParser::serializeIPv4(IPv4Address address)
2106{
2107 appendNumberToASCIIBuffer<uint8_t>(address >> 24);
2108 appendToASCIIBuffer('.');
2109 appendNumberToASCIIBuffer<uint8_t>(address >> 16);
2110 appendToASCIIBuffer('.');
2111 appendNumberToASCIIBuffer<uint8_t>(address >> 8);
2112 appendToASCIIBuffer('.');
2113 appendNumberToASCIIBuffer<uint8_t>(address);
2114}
2115
// Returns how many consecutive zero pieces |address| contains starting at index |begin|.
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2125
2126static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address)
2127{
2128 Optional<size_t> longest;
2129 size_t longestLength = 0;
2130 for (size_t i = 0; i < 8; i++) {
2131 size_t length = zeroSequenceLength(address, i);
2132 if (length) {
2133 if (length > 1 && (!longest || longestLength < length)) {
2134 longest = i;
2135 longestLength = length;
2136 }
2137 i += length;
2138 }
2139 }
2140 return longest;
2141}
2142
2143void URLParser::serializeIPv6Piece(uint16_t piece)
2144{
2145 bool printed = false;
2146 if (auto nibble0 = piece >> 12) {
2147 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2148 printed = true;
2149 }
2150 auto nibble1 = piece >> 8 & 0xF;
2151 if (printed || nibble1) {
2152 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2153 printed = true;
2154 }
2155 auto nibble2 = piece >> 4 & 0xF;
2156 if (printed || nibble2)
2157 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2158 appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF));
2159}
2160
2161void URLParser::serializeIPv6(URLParser::IPv6Address address)
2162{
2163 appendToASCIIBuffer('[');
2164 auto compressPointer = findLongestZeroSequence(address);
2165 for (size_t piece = 0; piece < 8; piece++) {
2166 if (compressPointer && compressPointer.value() == piece) {
2167 ASSERT(!address[piece]);
2168 if (piece)
2169 appendToASCIIBuffer(':');
2170 else
2171 appendToASCIIBuffer("::", 2);
2172 while (piece < 8 && !address[piece])
2173 piece++;
2174 if (piece == 8)
2175 break;
2176 }
2177 serializeIPv6Piece(address[piece]);
2178 if (piece < 7)
2179 appendToASCIIBuffer(':');
2180 }
2181 appendToASCIIBuffer(']');
2182}
2183
2184enum class URLParser::IPv4PieceParsingError {
2185 Failure,
2186 Overflow,
2187};
2188
2189template<typename CharacterType>
2190Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2191{
2192 enum class State : uint8_t {
2193 UnknownBase,
2194 Decimal,
2195 OctalOrHex,
2196 Octal,
2197 Hex,
2198 };
2199 State state = State::UnknownBase;
2200 Checked<uint32_t, RecordOverflow> value = 0;
2201 if (!iterator.atEnd() && *iterator == '.')
2202 return makeUnexpected(IPv4PieceParsingError::Failure);
2203 while (!iterator.atEnd()) {
2204 if (isTabOrNewline(*iterator)) {
2205 didSeeSyntaxViolation = true;
2206 ++iterator;
2207 continue;
2208 }
2209 if (*iterator == '.') {
2210 ASSERT(!value.hasOverflowed());
2211 return value.unsafeGet();
2212 }
2213 switch (state) {
2214 case State::UnknownBase:
2215 if (UNLIKELY(*iterator == '0')) {
2216 ++iterator;
2217 state = State::OctalOrHex;
2218 break;
2219 }
2220 state = State::Decimal;
2221 break;
2222 case State::OctalOrHex:
2223 didSeeSyntaxViolation = true;
2224 if (*iterator == 'x' || *iterator == 'X') {
2225 ++iterator;
2226 state = State::Hex;
2227 break;
2228 }
2229 state = State::Octal;
2230 break;
2231 case State::Decimal:
2232 if (!isASCIIDigit(*iterator))
2233 return makeUnexpected(IPv4PieceParsingError::Failure);
2234 value *= 10;
2235 value += *iterator - '0';
2236 if (UNLIKELY(value.hasOverflowed()))
2237 return makeUnexpected(IPv4PieceParsingError::Overflow);
2238 ++iterator;
2239 break;
2240 case State::Octal:
2241 ASSERT(didSeeSyntaxViolation);
2242 if (*iterator < '0' || *iterator > '7')
2243 return makeUnexpected(IPv4PieceParsingError::Failure);
2244 value *= 8;
2245 value += *iterator - '0';
2246 if (UNLIKELY(value.hasOverflowed()))
2247 return makeUnexpected(IPv4PieceParsingError::Overflow);
2248 ++iterator;
2249 break;
2250 case State::Hex:
2251 ASSERT(didSeeSyntaxViolation);
2252 if (!isASCIIHexDigit(*iterator))
2253 return makeUnexpected(IPv4PieceParsingError::Failure);
2254 value *= 16;
2255 value += toASCIIHexValue(*iterator);
2256 if (UNLIKELY(value.hasOverflowed()))
2257 return makeUnexpected(IPv4PieceParsingError::Overflow);
2258 ++iterator;
2259 break;
2260 }
2261 }
2262 ASSERT(!value.hasOverflowed());
2263 return value.unsafeGet();
2264}
2265
2266ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2267{
2268 RELEASE_ASSERT(exponent <= 4);
2269 uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 };
2270 return values[exponent];
2271}
2272
2273enum class URLParser::IPv4ParsingError {
2274 Failure,
2275 NotIPv4,
2276};
2277
2278template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2279Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
2280{
2281 Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items;
2282 bool didSeeSyntaxViolation = false;
2283 if (!iterator.atEnd() && *iterator == '.')
2284 return makeUnexpected(IPv4ParsingError::NotIPv4);
2285 while (!iterator.atEnd()) {
2286 if (isTabOrNewline(*iterator)) {
2287 didSeeSyntaxViolation = true;
2288 ++iterator;
2289 continue;
2290 }
2291 if (items.size() >= 4)
2292 return makeUnexpected(IPv4ParsingError::NotIPv4);
2293 items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
2294 if (!iterator.atEnd() && *iterator == '.') {
2295 ++iterator;
2296 if (iterator.atEnd())
2297 syntaxViolation(iteratorForSyntaxViolationPosition);
2298 else if (*iterator == '.')
2299 return makeUnexpected(IPv4ParsingError::NotIPv4);
2300 }
2301 }
2302 if (!iterator.atEnd() || !items.size() || items.size() > 4)
2303 return makeUnexpected(IPv4ParsingError::NotIPv4);
2304 for (const auto& item : items) {
2305 if (!item.has_value() && item.error() == IPv4PieceParsingError::Failure)
2306 return makeUnexpected(IPv4ParsingError::NotIPv4);
2307 }
2308 for (const auto& item : items) {
2309 if (!item.has_value() && item.error() == IPv4PieceParsingError::Overflow)
2310 return makeUnexpected(IPv4ParsingError::Failure);
2311 }
2312 if (items.size() > 1) {
2313 for (size_t i = 0; i < items.size() - 1; i++) {
2314 if (items[i].value() > 255)
2315 return makeUnexpected(IPv4ParsingError::Failure);
2316 }
2317 }
2318 if (items[items.size() - 1].value() >= pow256(5 - items.size()))
2319 return makeUnexpected(IPv4ParsingError::Failure);
2320
2321 if (didSeeSyntaxViolation)
2322 syntaxViolation(iteratorForSyntaxViolationPosition);
2323 for (const auto& item : items) {
2324 if (item.value() > 255)
2325 syntaxViolation(iteratorForSyntaxViolationPosition);
2326 }
2327
2328 if (UNLIKELY(items.size() != 4))
2329 syntaxViolation(iteratorForSyntaxViolationPosition);
2330
2331 IPv4Address ipv4 = items.takeLast().value();
2332 for (size_t counter = 0; counter < items.size(); ++counter)
2333 ipv4 += items[counter].value() * pow256(3 - counter);
2334 return ipv4;
2335}
2336
2337template<typename CharacterType>
2338Optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2339{
2340 if (iterator.atEnd())
2341 return WTF::nullopt;
2342 uint32_t piece = 0;
2343 bool leadingZeros = false;
2344 size_t digitCount = 0;
2345 while (!iterator.atEnd()) {
2346 if (!isASCIIDigit(*iterator))
2347 return WTF::nullopt;
2348 ++digitCount;
2349 if (!piece && *iterator == '0') {
2350 if (leadingZeros)
2351 return WTF::nullopt;
2352 leadingZeros = true;
2353 }
2354 if (!piece && *iterator == '0')
2355 leadingZeros = true;
2356 piece = piece * 10 + *iterator - '0';
2357 if (piece > 255)
2358 return WTF::nullopt;
2359 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2360 if (iterator.atEnd())
2361 break;
2362 if (*iterator == '.')
2363 break;
2364 }
2365 if (piece && leadingZeros)
2366 return WTF::nullopt;
2367 return piece;
2368}
2369
2370template<typename CharacterType>
2371Optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2372{
2373 IPv4Address address = 0;
2374 for (size_t i = 0; i < 4; ++i) {
2375 if (Optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2376 address = (address << 8) + piece.value();
2377 else
2378 return WTF::nullopt;
2379 if (i < 3) {
2380 if (iterator.atEnd())
2381 return WTF::nullopt;
2382 if (*iterator != '.')
2383 return WTF::nullopt;
2384 advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2385 } else if (!iterator.atEnd())
2386 return WTF::nullopt;
2387 }
2388 ASSERT(iterator.atEnd());
2389 return address;
2390}
2391
2392template<typename CharacterType>
2393Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2394{
2395 ASSERT(*c == '[');
2396 const auto hostBegin = c;
2397 advance(c, hostBegin);
2398 if (c.atEnd())
2399 return WTF::nullopt;
2400
2401 IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}};
2402 size_t piecePointer = 0;
2403 Optional<size_t> compressPointer;
2404 bool previousValueWasZero = false;
2405 bool immediatelyAfterCompress = false;
2406
2407 if (*c == ':') {
2408 advance(c, hostBegin);
2409 if (c.atEnd())
2410 return WTF::nullopt;
2411 if (*c != ':')
2412 return WTF::nullopt;
2413 advance(c, hostBegin);
2414 ++piecePointer;
2415 compressPointer = piecePointer;
2416 immediatelyAfterCompress = true;
2417 }
2418
2419 while (!c.atEnd()) {
2420 if (piecePointer == 8)
2421 return WTF::nullopt;
2422 if (*c == ':') {
2423 if (compressPointer)
2424 return WTF::nullopt;
2425 advance(c, hostBegin);
2426 ++piecePointer;
2427 compressPointer = piecePointer;
2428 immediatelyAfterCompress = true;
2429 if (previousValueWasZero)
2430 syntaxViolation(hostBegin);
2431 continue;
2432 }
2433 if (piecePointer == 6 || (compressPointer && piecePointer < 6)) {
2434 if (Optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2435 if (compressPointer && piecePointer == 5)
2436 return WTF::nullopt;
2437 syntaxViolation(hostBegin);
2438 address[piecePointer++] = ipv4Address.value() >> 16;
2439 address[piecePointer++] = ipv4Address.value() & 0xFFFF;
2440 c = { };
2441 break;
2442 }
2443 }
2444 uint16_t value = 0;
2445 size_t length = 0;
2446 bool leadingZeros = false;
2447 for (; length < 4; length++) {
2448 if (c.atEnd())
2449 break;
2450 if (!isASCIIHexDigit(*c))
2451 break;
2452 if (isASCIIUpper(*c))
2453 syntaxViolation(hostBegin);
2454 if (*c == '0' && !length)
2455 leadingZeros = true;
2456 value = value * 0x10 + toASCIIHexValue(*c);
2457 advance(c, hostBegin);
2458 }
2459
2460 previousValueWasZero = !value;
2461 if (UNLIKELY((value && leadingZeros) || (previousValueWasZero && (length > 1 || immediatelyAfterCompress))))
2462 syntaxViolation(hostBegin);
2463
2464 address[piecePointer++] = value;
2465 if (c.atEnd())
2466 break;
2467 if (piecePointer == 8 || *c != ':')
2468 return WTF::nullopt;
2469 advance(c, hostBegin);
2470
2471 immediatelyAfterCompress = false;
2472 }
2473
2474 if (!c.atEnd())
2475 return WTF::nullopt;
2476
2477 if (compressPointer) {
2478 size_t swaps = piecePointer - compressPointer.value();
2479 piecePointer = 7;
2480 while (swaps)
2481 std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]);
2482 } else if (piecePointer != 8)
2483 return WTF::nullopt;
2484
2485 Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2486 if (possibleCompressPointer)
2487 possibleCompressPointer.value()++;
2488 if (UNLIKELY(compressPointer != possibleCompressPointer))
2489 syntaxViolation(hostBegin);
2490
2491 return address;
2492}
2493
2494template<typename CharacterType>
2495URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2496{
2497 LCharBuffer output;
2498 output.reserveInitialCapacity(length);
2499
2500 for (size_t i = 0; i < length; ++i) {
2501 uint8_t byte = input[i];
2502 if (byte != '%')
2503 output.uncheckedAppend(byte);
2504 else if (length > 2 && i < length - 2) {
2505 if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2506 syntaxViolation(iteratorForSyntaxViolationPosition);
2507 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2508 i += 2;
2509 } else
2510 output.uncheckedAppend(byte);
2511 } else
2512 output.uncheckedAppend(byte);
2513 }
2514 return output;
2515}
2516
2517URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length)
2518{
2519 LCharBuffer output;
2520 output.reserveInitialCapacity(length);
2521
2522 for (size_t i = 0; i < length; ++i) {
2523 uint8_t byte = input[i];
2524 if (byte != '%')
2525 output.uncheckedAppend(byte);
2526 else if (length > 2 && i < length - 2) {
2527 if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
2528 output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
2529 i += 2;
2530 } else
2531 output.uncheckedAppend(byte);
2532 } else
2533 output.uncheckedAppend(byte);
2534 }
2535 return output;
2536}
2537
2538template<typename CharacterType> Optional<URLParser::LCharBuffer> URLParser::domainToASCII(StringImpl& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2539{
2540 LCharBuffer ascii;
2541 if (domain.isAllASCII()) {
2542 size_t length = domain.length();
2543 if (domain.is8Bit()) {
2544 const LChar* characters = domain.characters8();
2545 ascii.reserveInitialCapacity(length);
2546 for (size_t i = 0; i < length; ++i) {
2547 if (UNLIKELY(isASCIIUpper(characters[i])))
2548 syntaxViolation(iteratorForSyntaxViolationPosition);
2549 ascii.uncheckedAppend(toASCIILower(characters[i]));
2550 }
2551 } else {
2552 const UChar* characters = domain.characters16();
2553 ascii.reserveInitialCapacity(length);
2554 for (size_t i = 0; i < length; ++i) {
2555 if (UNLIKELY(isASCIIUpper(characters[i])))
2556 syntaxViolation(iteratorForSyntaxViolationPosition);
2557 ascii.uncheckedAppend(toASCIILower(characters[i]));
2558 }
2559 }
2560 return ascii;
2561 }
2562
2563 const size_t maxDomainLength = 64;
2564 UChar hostnameBuffer[maxDomainLength];
2565 UErrorCode error = U_ZERO_ERROR;
2566 UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2567 int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, maxDomainLength, &processingDetails, &error);
2568 ASSERT(numCharactersConverted <= static_cast<int32_t>(maxDomainLength));
2569
2570 if (U_SUCCESS(error) && !processingDetails.errors) {
2571 for (int32_t i = 0; i < numCharactersConverted; ++i) {
2572 ASSERT(isASCII(hostnameBuffer[i]));
2573 ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2574 }
2575 ascii.append(hostnameBuffer, numCharactersConverted);
2576 if (domain != StringView(ascii.data(), ascii.size()))
2577 syntaxViolation(iteratorForSyntaxViolationPosition);
2578 return ascii;
2579 }
2580 return WTF::nullopt;
2581}
2582
2583bool URLParser::hasForbiddenHostCodePoint(const URLParser::LCharBuffer& asciiDomain)
2584{
2585 for (size_t i = 0; i < asciiDomain.size(); ++i) {
2586 if (isForbiddenHostCodePoint(asciiDomain[i]))
2587 return true;
2588 }
2589 return false;
2590}
2591
2592template<typename CharacterType>
2593bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2594{
2595 ASSERT(*iterator == ':');
2596 auto colonIterator = iterator;
2597 advance(iterator, colonIterator);
2598 uint32_t port = 0;
2599 if (UNLIKELY(iterator.atEnd())) {
2600 unsigned portLength = currentPosition(colonIterator) - m_url.m_hostEnd;
2601 RELEASE_ASSERT(portLength <= URL::maxPortLength);
2602 m_url.m_portLength = portLength;
2603 syntaxViolation(colonIterator);
2604 return true;
2605 }
2606 size_t digitCount = 0;
2607 bool leadingZeros = false;
2608 for (; !iterator.atEnd(); ++iterator) {
2609 if (UNLIKELY(isTabOrNewline(*iterator))) {
2610 syntaxViolation(colonIterator);
2611 continue;
2612 }
2613 if (isASCIIDigit(*iterator)) {
2614 if (*iterator == '0' && !digitCount)
2615 leadingZeros = true;
2616 ++digitCount;
2617 port = port * 10 + *iterator - '0';
2618 if (port > std::numeric_limits<uint16_t>::max())
2619 return false;
2620 } else
2621 return false;
2622 }
2623
2624 if (port && leadingZeros)
2625 syntaxViolation(colonIterator);
2626
2627 if (!port && digitCount > 1)
2628 syntaxViolation(colonIterator);
2629
2630 ASSERT(port == static_cast<uint16_t>(port));
2631 if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2632 syntaxViolation(colonIterator);
2633 else {
2634 appendToASCIIBuffer(':');
2635 ASSERT(port <= std::numeric_limits<uint16_t>::max());
2636 appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2637 }
2638
2639 unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2640 RELEASE_ASSERT(portLength <= URL::maxPortLength);
2641 m_url.m_portLength = portLength;
2642 return true;
2643}
2644
2645template<typename CharacterType>
2646bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2647{
2648 if (iterator.atEnd())
2649 return false;
2650 if (*iterator == ':')
2651 return false;
2652 if (*iterator == '[') {
2653 auto ipv6End = iterator;
2654 while (!ipv6End.atEnd() && *ipv6End != ']')
2655 ++ipv6End;
2656 if (ipv6End.atEnd())
2657 return false;
2658 if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2659 serializeIPv6(address.value());
2660 if (!ipv6End.atEnd()) {
2661 advance(ipv6End);
2662 m_url.m_hostEnd = currentPosition(ipv6End);
2663 if (!ipv6End.atEnd() && *ipv6End == ':')
2664 return parsePort(ipv6End);
2665 m_url.m_portLength = 0;
2666 return ipv6End.atEnd();
2667 }
2668 m_url.m_hostEnd = currentPosition(ipv6End);
2669 return true;
2670 }
2671 return false;
2672 }
2673
2674 if (!m_urlIsSpecial) {
2675 for (; !iterator.atEnd(); ++iterator) {
2676 if (UNLIKELY(isTabOrNewline(*iterator))) {
2677 syntaxViolation(iterator);
2678 continue;
2679 }
2680 if (*iterator == ':')
2681 break;
2682 if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%'))
2683 return false;
2684 utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2685 }
2686 m_url.m_hostEnd = currentPosition(iterator);
2687 if (iterator.atEnd()) {
2688 m_url.m_portLength = 0;
2689 return true;
2690 }
2691 return parsePort(iterator);
2692 }
2693
2694 if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2695 auto hostIterator = iterator;
2696 for (; !iterator.atEnd(); ++iterator) {
2697 if (isTabOrNewline(*iterator))
2698 continue;
2699 if (*iterator == ':')
2700 break;
2701 if (isForbiddenHostCodePoint(*iterator))
2702 return false;
2703 }
2704 auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
2705 if (address) {
2706 serializeIPv4(address.value());
2707 m_url.m_hostEnd = currentPosition(iterator);
2708 if (iterator.atEnd()) {
2709 m_url.m_portLength = 0;
2710 return true;
2711 }
2712 return parsePort(iterator);
2713 }
2714 if (address.error() == IPv4ParsingError::Failure)
2715 return false;
2716 for (; hostIterator != iterator; ++hostIterator) {
2717 if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2718 syntaxViolation(hostIterator);
2719 continue;
2720 }
2721 if (UNLIKELY(isASCIIUpper(*hostIterator)))
2722 syntaxViolation(hostIterator);
2723 appendToASCIIBuffer(toASCIILower(*hostIterator));
2724 }
2725 m_url.m_hostEnd = currentPosition(iterator);
2726 if (!hostIterator.atEnd())
2727 return parsePort(hostIterator);
2728 unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2729 RELEASE_ASSERT(portLength <= URL::maxPortLength);
2730 m_url.m_portLength = portLength;
2731 return true;
2732 }
2733
2734 const auto hostBegin = iterator;
2735
2736 LCharBuffer utf8Encoded;
2737 for (; !iterator.atEnd(); ++iterator) {
2738 if (UNLIKELY(isTabOrNewline(*iterator))) {
2739 syntaxViolation(hostBegin);
2740 continue;
2741 }
2742 if (*iterator == ':')
2743 break;
2744 if (UNLIKELY(!isASCII(*iterator)))
2745 syntaxViolation(hostBegin);
2746
2747 if (!U_IS_UNICODE_CHAR(*iterator))
2748 return false;
2749 uint8_t buffer[U8_MAX_LENGTH];
2750 int32_t offset = 0;
2751 U8_APPEND_UNSAFE(buffer, offset, *iterator);
2752 utf8Encoded.append(buffer, offset);
2753 }
2754 LCharBuffer percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2755 String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2756 if (domain.isNull())
2757 return false;
2758 if (domain != StringView(percentDecoded.data(), percentDecoded.size()))
2759 syntaxViolation(hostBegin);
2760 auto asciiDomain = domainToASCII(*domain.impl(), hostBegin);
2761 if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value()))
2762 return false;
2763 LCharBuffer& asciiDomainValue = asciiDomain.value();
2764 const LChar* asciiDomainCharacters = asciiDomainValue.data();
2765
2766 auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
2767 if (address) {
2768 serializeIPv4(address.value());
2769 m_url.m_hostEnd = currentPosition(iterator);
2770 if (iterator.atEnd()) {
2771 m_url.m_portLength = 0;
2772 return true;
2773 }
2774 return parsePort(iterator);
2775 }
2776 if (address.error() == IPv4ParsingError::Failure)
2777 return false;
2778
2779 appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2780 m_url.m_hostEnd = currentPosition(iterator);
2781 if (!iterator.atEnd())
2782 return parsePort(iterator);
2783 m_url.m_portLength = 0;
2784 return true;
2785}
2786
2787Optional<String> URLParser::formURLDecode(StringView input)
2788{
2789 auto utf8 = input.utf8(StrictConversion);
2790 if (utf8.isNull())
2791 return WTF::nullopt;
2792 auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2793 return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2794}
2795
2796// https://url.spec.whatwg.org/#concept-urlencoded-parser
2797auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2798{
2799 URLEncodedForm output;
2800 for (StringView bytes : input.split('&')) {
2801 auto equalIndex = bytes.find('=');
2802 if (equalIndex == notFound) {
2803 auto name = formURLDecode(bytes.toString().replace('+', 0x20));
2804 if (name)
2805 output.append({ name.value(), emptyString() });
2806 } else {
2807 auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20));
2808 auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20));
2809 if (name && value)
2810 output.append({ name.value(), value.value() });
2811 }
2812 }
2813 return output;
2814}
2815
2816static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2817{
2818 auto utf8 = input.utf8(StrictConversion);
2819 const char* data = utf8.data();
2820 for (size_t i = 0; i < utf8.length(); ++i) {
2821 const char byte = data[i];
2822 if (byte == 0x20)
2823 output.append(0x2B);
2824 else if (byte == 0x2A
2825 || byte == 0x2D
2826 || byte == 0x2E
2827 || (byte >= 0x30 && byte <= 0x39)
2828 || (byte >= 0x41 && byte <= 0x5A)
2829 || byte == 0x5F
2830 || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
2831 output.append(byte);
2832 else
2833 percentEncodeByte(byte, output);
2834 }
2835}
2836
2837String URLParser::serialize(const URLEncodedForm& tuples)
2838{
2839 if (tuples.isEmpty())
2840 return { };
2841
2842 Vector<LChar> output;
2843 for (auto& tuple : tuples) {
2844 if (!output.isEmpty())
2845 output.append('&');
2846 serializeURLEncodedForm(tuple.key, output);
2847 output.append('=');
2848 serializeURLEncodedForm(tuple.value, output);
2849 }
2850 return String::adopt(WTFMove(output));
2851}
2852
2853const UIDNA& URLParser::internationalDomainNameTranscoder()
2854{
2855 static UIDNA* encoder;
2856 static std::once_flag onceFlag;
2857 std::call_once(onceFlag, [] {
2858 UErrorCode error = U_ZERO_ERROR;
2859 encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
2860 RELEASE_ASSERT(U_SUCCESS(error));
2861 RELEASE_ASSERT(encoder);
2862 });
2863 return *encoder;
2864}
2865
2866bool URLParser::allValuesEqual(const URL& a, const URL& b)
2867{
2868 URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2869 a.m_isValid,
2870 a.m_cannotBeABaseURL,
2871 a.m_protocolIsInHTTPFamily,
2872 a.m_schemeEnd,
2873 a.m_userStart,
2874 a.m_userEnd,
2875 a.m_passwordEnd,
2876 a.m_hostEnd,
2877 a.m_hostEnd + a.m_portLength,
2878 a.m_pathAfterLastSlash,
2879 a.m_pathEnd,
2880 a.m_queryEnd,
2881 a.m_string.utf8().data(),
2882 b.m_isValid,
2883 b.m_cannotBeABaseURL,
2884 b.m_protocolIsInHTTPFamily,
2885 b.m_schemeEnd,
2886 b.m_userStart,
2887 b.m_userEnd,
2888 b.m_passwordEnd,
2889 b.m_hostEnd,
2890 b.m_hostEnd + b.m_portLength,
2891 b.m_pathAfterLastSlash,
2892 b.m_pathEnd,
2893 b.m_queryEnd,
2894 b.m_string.utf8().data());
2895
2896 return a.m_string == b.m_string
2897 && a.m_isValid == b.m_isValid
2898 && a.m_cannotBeABaseURL == b.m_cannotBeABaseURL
2899 && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2900 && a.m_schemeEnd == b.m_schemeEnd
2901 && a.m_userStart == b.m_userStart
2902 && a.m_userEnd == b.m_userEnd
2903 && a.m_passwordEnd == b.m_passwordEnd
2904 && a.m_hostEnd == b.m_hostEnd
2905 && a.m_portLength == b.m_portLength
2906 && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2907 && a.m_pathEnd == b.m_pathEnd
2908 && a.m_queryEnd == b.m_queryEnd;
2909}
2910
2911bool URLParser::internalValuesConsistent(const URL& url)
2912{
2913 return url.m_schemeEnd <= url.m_userStart
2914 && url.m_userStart <= url.m_userEnd
2915 && url.m_userEnd <= url.m_passwordEnd
2916 && url.m_passwordEnd <= url.m_hostEnd
2917 && url.m_hostEnd + url.m_portLength <= url.m_pathAfterLastSlash
2918 && url.m_pathAfterLastSlash <= url.m_pathEnd
2919 && url.m_pathEnd <= url.m_queryEnd
2920 && url.m_queryEnd <= url.m_string.length();
2921}
2922
2923} // namespace WTF
2924