URLParser.cpp source code [jsc/Source/WTF/wtf/URLParser.cpp]

1	/*
2	* Copyright (C) 2016-2019 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15	* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23	* THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#include "config.h"
27	#include <wtf/URLParser.h>
28
29	#include <array>
30	#include <mutex>
31	#include <unicode/uidna.h>
32	#include <unicode/utf8.h>
33	#include <unicode/utypes.h>
34
35	namespace WTF {
36
// Compile-time switch for parser tracing. When non-zero, URL_PARSER_LOG forwards to WTFLogAlways.
#define URL_PARSER_DEBUGGING 0

#if !URL_PARSER_DEBUGGING
// Tracing disabled: log statements expand to nothing.
#define URL_PARSER_LOG(...)
#else
#define URL_PARSER_LOG(...) WTFLogAlways(__VA_ARGS__)
#endif
44
45	template<typename CharacterType>
46	class CodePointIterator {
47	WTF_MAKE_FAST_ALLOCATED;
48	public:
49	ALWAYS_INLINE CodePointIterator() { }
50	ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end)
51	: m_begin(begin)
52	, m_end(end)
53	{
54	}
55
56	ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end)
57	: CodePointIterator(begin.m_begin, end.m_begin)
58	{
59	ASSERT(end.m_begin >= begin.m_begin);
60	}
61
62	ALWAYS_INLINE UChar32 operator() const*;
63	ALWAYS_INLINE CodePointIterator& operator++();
64
65	ALWAYS_INLINE bool operator==(const CodePointIterator& other) const
66	{
67	return m_begin == other.m_begin
68	&& m_end == other.m_end;
69	}
70	ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
71
72	ALWAYS_INLINE bool atEnd() const
73	{
74	ASSERT(m_begin <= m_end);
75	return m_begin >= m_end;
76	}
77
78	ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const
79	{
80	ASSERT(m_begin >= reference);
81	return m_begin - reference;
82	}
83
84	ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const
85	{
86	return codeUnitsSince(other.m_begin);
87	}
88
89	private:
90	const CharacterType* m_begin { nullptr };
91	const CharacterType* m_end { nullptr };
92	};
93
94	template<>
95	ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator() const*
96	{
97	ASSERT(!atEnd());
98	return *m_begin;
99	}
100
101	template<>
102	ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator&
103	{
104	m_begin++;
105	return *this;
106	}
107
108	template<>
109	ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator() const*
110	{
111	ASSERT(!atEnd());
112	UChar32 c;
113	U16_GET(m_begin, `0`, `0`, m_end - m_begin, c);
114	return c;
115	}
116
117	template<>
118	ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
119	{
120	unsigned i = `0`;
121	size_t length = m_end - m_begin;
122	U16_FWD_1(m_begin, i, length);
123	m_begin += i;
124	return *this;
125	}
126
127	ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
128	{
129	if (U_IS_BMP(codePoint)) {
130	destination.append(static_cast<UChar>(codePoint));
131	return;
132	}
133	destination.reserveCapacity(destination.size() + `2`);
134	destination.uncheckedAppend(U16_LEAD(codePoint));
135	destination.uncheckedAppend(U16_TRAIL(codePoint));
136	}
137
// Bit flags stored in characterClassTable; one entry may carry several of them.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    ForbiddenHost = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};

// Class flags for every 8-bit value, indexed by that value.
static const uint8_t characterClassTable[256] = {
    // 0x00 - 0x1F: control characters. NUL, tab, LF and CR additionally carry ForbiddenHost.
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x00
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x01 - 0x02
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x03 - 0x04
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x05 - 0x06
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x07 - 0x08
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x09
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0A
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x0B - 0x0C
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0D
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x0E - 0x0F
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x10 - 0x11
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x12 - 0x13
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x14 - 0x15
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x16 - 0x17
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x18 - 0x19
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x1A - 0x1B
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x1C - 0x1D
    UserInfo | Default | QueryPercent, UserInfo | Default | QueryPercent, // 0x1E - 0x1F

    // 0x20 - 0x2F: space and punctuation.
    UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
    0, // '$'
    ForbiddenHost, // '%'
    0, // '&'
    0, // '\''
    0, // '('
    0, // ')'
    0, // '*'
    ValidScheme, // '+'
    0, // ','
    ValidScheme, // '-'
    ValidScheme, // '.'
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'

    // 0x30 - 0x39: digits.
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // '0' - '4'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // '5' - '9'

    // 0x3A - 0x40: punctuation.
    UserInfo | ForbiddenHost, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent | ForbiddenHost, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent | ForbiddenHost, // '>'
    UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
    UserInfo | ForbiddenHost, // '@'

    // 0x41 - 0x5A: uppercase letters.
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'A' - 'F'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'G' - 'L'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'M' - 'R'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'S' - 'X'
    ValidScheme, ValidScheme, // 'Y' - 'Z'

    // 0x5B - 0x60: punctuation.
    UserInfo | ForbiddenHost, // '['
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
    UserInfo | ForbiddenHost, // ']'
    UserInfo, // '^'
    0, // '_'
    UserInfo | Default, // '`'

    // 0x61 - 0x7A: lowercase letters.
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'a' - 'f'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'g' - 'l'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 'm' - 'r'
    ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, ValidScheme, // 's' - 'x'
    ValidScheme, ValidScheme, // 'y' - 'z'

    // 0x7B - 0x7E: punctuation.
    UserInfo | Default, // '{'
    UserInfo, // '|'
    UserInfo | Default, // '}'
    0, // '~'

    // 0x7F - 0xFF: DEL and every non-ASCII byte carry only QueryPercent.
    QueryPercent, // 0x7F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x80 - 0x87
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x88 - 0x8F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x90 - 0x97
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0x98 - 0x9F
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA0 - 0xA7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xA8 - 0xAF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB0 - 0xB7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xB8 - 0xBF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC0 - 0xC7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xC8 - 0xCF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD0 - 0xD7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xD8 - 0xDF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE0 - 0xE7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xE8 - 0xEF
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF0 - 0xF7
    QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, QueryPercent, // 0xF8 - 0xFF
};
405
406	template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= `0x1F`; }
407	template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= `0x20`; }
408	template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= `0xD` && character >= `0x9` && character != `0xB` && character != `0xC`; }
409	template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > `0x7E` \|\| isC0Control(character); }
410	template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > `0x7E` \|\| characterClassTable[character] & Default; }
411	template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > `0x7E` \|\| characterClassTable[character] & UserInfo; }
412	template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) \|\| character == `'%'`; }
413	template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= `'\\'` && characterClassTable[character] & SlashQuestionOrHash; }
414	template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= `'z'` && characterClassTable[character] & ValidScheme; }
415	template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= `']'` && characterClassTable[character] & ForbiddenHost; }
416	ALWAYS_INLINE static bool shouldPercentEncodeQueryByte(uint8_t byte, const bool& urlIsSpecial)
417	{
418	if (characterClassTable[byte] & QueryPercent)
419	return true;
420	if (byte == `'\''` && urlIsSpecial)
421	return true;
422	return false;
423	}
424
425	bool URLParser::isInUserInfoEncodeSet(UChar c)
426	{
427	return WTF::isInUserInfoEncodeSet(c);
428	}
429
430	template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation>
431	ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
432	{
433	++iterator;
434	while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
435	if (reportSyntaxViolation == ReportSyntaxViolation::Yes)
436	syntaxViolation(iteratorForSyntaxViolationPosition);
437	++iterator;
438	}
439	}
440
441	template<typename CharacterType>
442	bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator)
443	{
444	if (iterator.atEnd())
445	return false;
446	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
447	if (iterator.atEnd())
448	return false;
449	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
450	return iterator.atEnd();
451	}
452
453	template<typename CharacterType>
454	ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator)
455	{
456	if (iterator.atEnd() \|\| !isASCIIAlpha(*iterator))
457	return false;
458	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
459	if (iterator.atEnd())
460	return false;
461	if (*iterator == `':'`)
462	return true;
463	if (UNLIKELY(*iterator == `'\|'`))
464	return true;
465	return false;
466	}
467
468	ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
469	{
470	ASSERT(isASCII(codePoint));
471	if (UNLIKELY(m_didSeeSyntaxViolation))
472	m_asciiBuffer.append(codePoint);
473	}
474
475	ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
476	{
477	if (UNLIKELY(m_didSeeSyntaxViolation))
478	m_asciiBuffer.append(characters, length);
479	}
480
481	template<typename CharacterType>
482	void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator)
483	{
484	ASSERT(isWindowsDriveLetter(iterator));
485	appendToASCIIBuffer(*iterator);
486	advance(iterator);
487	ASSERT(!iterator.atEnd());
488	ASSERT(iterator == `':'` \|\| iterator == `'\|'`);
489	if (*iterator == `'\|'`)
490	syntaxViolation(iterator);
491	appendToASCIIBuffer(`':'`);
492	advance(iterator);
493	}
494
495	bool URLParser::copyBaseWindowsDriveLetter(const URL& base)
496	{
497	if (base.protocolIs("file")) {
498	RELEASE_ASSERT(base.m_hostEnd + base.m_portLength < base.m_string.length());
499	if (base.m_string.is8Bit()) {
500	const LChar* begin = base.m_string.characters8();
501	CodePointIterator<LChar> c(begin + base.m_hostEnd + base.m_portLength + `1`, begin + base.m_string.length());
502	if (isWindowsDriveLetter(c)) {
503	appendWindowsDriveLetter(c);
504	return true;
505	}
506	} else {
507	const UChar* begin = base.m_string.characters16();
508	CodePointIterator<UChar> c(begin + base.m_hostEnd + base.m_portLength + `1`, begin + base.m_string.length());
509	if (isWindowsDriveLetter(c)) {
510	appendWindowsDriveLetter(c);
511	return true;
512	}
513	}
514	}
515	return false;
516	}
517
518	template<typename CharacterType>
519	bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator)
520	{
521	if (!isWindowsDriveLetter(iterator))
522	return true;
523	if (iterator.atEnd())
524	return false;
525	advance(iterator);
526	if (iterator.atEnd())
527	return true;
528	advance(iterator);
529	if (iterator.atEnd())
530	return true;
531	return !isSlashQuestionOrHash(*iterator);
532	}
533
534	static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer)
535	{
536	buffer.append(`'%'`);
537	buffer.append(upperNibbleToASCIIHexDigit(byte));
538	buffer.append(lowerNibbleToASCIIHexDigit(byte));
539	}
540
541	void URLParser::percentEncodeByte(uint8_t byte)
542	{
543	ASSERT(m_didSeeSyntaxViolation);
544	appendToASCIIBuffer(`'%'`);
545	appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
546	appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
547	}
548
// Percent-encoded form of the bytes EF BF BD (the UTF-8 encoding of U+FFFD), emitted
// in place of code points that cannot be encoded.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
// Number of characters in the string above, excluding the terminating NUL.
const size_t replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) / sizeof(replacementCharacterUTF8PercentEncoded[0]) - 1;
551
552	template<bool(isInCodeSet)(UChar32), typename* CharacterType>
553	ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator)
554	{
555	ASSERT(!iterator.atEnd());
556	UChar32 codePoint = *iterator;
557	if (LIKELY(isASCII(codePoint))) {
558	if (UNLIKELY(isInCodeSet(codePoint))) {
559	syntaxViolation(iterator);
560	percentEncodeByte(codePoint);
561	} else
562	appendToASCIIBuffer(codePoint);
563	return;
564	}
565	ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters");
566	syntaxViolation(iterator);
567
568	if (!U_IS_UNICODE_CHAR(codePoint)) {
569	appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
570	return;
571	}
572
573	uint8_t buffer[U8_MAX_LENGTH];
574	int32_t offset = `0`;
575	U8_APPEND_UNSAFE(buffer, offset, codePoint);
576	for (int32_t i = `0`; i < offset; ++i)
577	percentEncodeByte(buffer[i]);
578	}
579
580	template<typename CharacterType>
581	ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator)
582	{
583	ASSERT(!iterator.atEnd());
584	UChar32 codePoint = *iterator;
585	if (LIKELY(isASCII(codePoint))) {
586	if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint, m_urlIsSpecial))) {
587	syntaxViolation(iterator);
588	percentEncodeByte(codePoint);
589	} else
590	appendToASCIIBuffer(codePoint);
591	return;
592	}
593
594	syntaxViolation(iterator);
595
596	if (!U_IS_UNICODE_CHAR(codePoint)) {
597	appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength);
598	return;
599	}
600
601	uint8_t buffer[U8_MAX_LENGTH];
602	int32_t offset = `0`;
603	U8_APPEND_UNSAFE(buffer, offset, codePoint);
604	for (int32_t i = `0`; i < offset; ++i) {
605	auto byte = buffer[i];
606	if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
607	percentEncodeByte(byte);
608	else
609	appendToASCIIBuffer(byte);
610	}
611	}
612
613	template<typename CharacterType>
614	void URLParser::encodeNonUTF8Query(const Vector<UChar>& source, const URLTextEncoding& encoding, CodePointIterator<CharacterType> iterator)
615	{
616	auto encoded = encoding.encodeForURLParsing(StringView (source.data(), source.size()));
617	auto* data = encoded.data();
618	size_t length = encoded.size();
619
620	if (!length == !iterator.atEnd()) {
621	syntaxViolation(iterator);
622	return;
623	}
624
625	size_t i = `0`;
626	for (; i < length; ++i) {
627	ASSERT(!iterator.atEnd());
628	uint8_t byte = data[i];
629	if (UNLIKELY(byte != *iterator)) {
630	syntaxViolation(iterator);
631	break;
632	}
633	if (UNLIKELY(shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))) {
634	syntaxViolation(iterator);
635	break;
636	}
637	appendToASCIIBuffer(byte);
638	++iterator;
639	}
640	while (!iterator.atEnd() && isTabOrNewline(*iterator))
641	++iterator;
642	ASSERT((i == length) == iterator.atEnd());
643	for (; i < length; ++i) {
644	ASSERT(m_didSeeSyntaxViolation);
645	uint8_t byte = data[i];
646	if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))
647	percentEncodeByte(byte);
648	else
649	appendToASCIIBuffer(byte);
650	}
651	}
652
653	Optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme)
654	{
655	static constexpr uint16_t ftpPort = `21`;
656	static constexpr uint16_t httpPort = `80`;
657	static constexpr uint16_t httpsPort = `443`;
658	static constexpr uint16_t wsPort = `80`;
659	static constexpr uint16_t wssPort = `443`;
660
661	auto length = scheme.length();
662	if (!length)
663	return WTF::nullopt;
664	switch (scheme [`0`]) {
665	case `'w'`:
666	switch (length) {
667	case `2`:
668	if (scheme [`1`] == `'s'`)
669	return wsPort;
670	return WTF::nullopt;
671	case `3`:
672	if (scheme [`1`] == `'s'`
673	&& scheme [`2`] == `'s'`)
674	return wssPort;
675	return WTF::nullopt;
676	default:
677	return false;
678	}
679	case `'h'`:
680	switch (length) {
681	case `4`:
682	if (scheme [`1`] == `'t'`
683	&& scheme [`2`] == `'t'`
684	&& scheme [`3`] == `'p'`)
685	return httpPort;
686	return WTF::nullopt;
687	case `5`:
688	if (scheme [`1`] == `'t'`
689	&& scheme [`2`] == `'t'`
690	&& scheme [`3`] == `'p'`
691	&& scheme [`4`] == `'s'`)
692	return httpsPort;
693	return WTF::nullopt;
694	default:
695	return WTF::nullopt;
696	}
697	case `'f'`:
698	if (length == `3`
699	&& scheme [`1`] == `'t'`
700	&& scheme [`2`] == `'p'`)
701	return ftpPort;
702	return WTF::nullopt;
703	default:
704	return WTF::nullopt;
705	}
706	}
707
// Result of classifying a URL scheme: one of the recognized schemes, or NonSpecial
// for everything else.
enum class Scheme {
    WS, // "ws"
    WSS, // "wss"
    File, // "file"
    FTP, // "ftp"
    HTTP, // "http"
    HTTPS, // "https"
    NonSpecial,
};
717
718	ALWAYS_INLINE static Scheme scheme(StringView scheme)
719	{
720	auto length = scheme.length();
721	if (!length)
722	return Scheme::NonSpecial;
723	switch (scheme [`0`]) {
724	case `'f'`:
725	switch (length) {
726	case `3`:
727	if (scheme [`1`] == `'t'`
728	&& scheme [`2`] == `'p'`)
729	return Scheme::FTP;
730	return Scheme::NonSpecial;
731	case `4`:
732	if (scheme [`1`] == `'i'`
733	&& scheme [`2`] == `'l'`
734	&& scheme [`3`] == `'e'`)
735	return Scheme::File;
736	return Scheme::NonSpecial;
737	default:
738	return Scheme::NonSpecial;
739	}
740	case `'h'`:
741	switch (length) {
742	case `4`:
743	if (scheme [`1`] == `'t'`
744	&& scheme [`2`] == `'t'`
745	&& scheme [`3`] == `'p'`)
746	return Scheme::HTTP;
747	return Scheme::NonSpecial;
748	case `5`:
749	if (scheme [`1`] == `'t'`
750	&& scheme [`2`] == `'t'`
751	&& scheme [`3`] == `'p'`
752	&& scheme [`4`] == `'s'`)
753	return Scheme::HTTPS;
754	return Scheme::NonSpecial;
755	default:
756	return Scheme::NonSpecial;
757	}
758	case `'w'`:
759	switch (length) {
760	case `2`:
761	if (scheme [`1`] == `'s'`)
762	return Scheme::WS;
763	return Scheme::NonSpecial;
764	case `3`:
765	if (scheme [`1`] == `'s'`
766	&& scheme [`2`] == `'s'`)
767	return Scheme::WSS;
768	return Scheme::NonSpecial;
769	default:
770	return Scheme::NonSpecial;
771	}
772	default:
773	return Scheme::NonSpecial;
774	}
775	}
776
777	Optional<String> URLParser::maybeCanonicalizeScheme(const String& scheme)
778	{
779	if (scheme.isEmpty())
780	return WTF::nullopt;
781
782	if (!isASCIIAlpha(scheme [`0`]))
783	return WTF::nullopt;
784
785	for (size_t i = `1`; i < scheme.length(); ++i) {
786	if (isASCIIAlphanumeric(scheme [i]) \|\| scheme [i] == `'+'` \|\| scheme [i] == `'-'` \|\| scheme [i] == `'.'`)
787	continue;
788	return WTF::nullopt;
789	}
790
791	return scheme.convertToASCIILowercase();
792	}
793
794	bool URLParser::isSpecialScheme(const String& schemeArg)
795	{
796	return scheme(schemeArg) != Scheme::NonSpecial;
797	}
798
799	enum class URLParser::URLPart {
800	SchemeEnd,
801	UserStart,
802	UserEnd,
803	PasswordEnd,
804	HostEnd,
805	PortEnd,
806	PathAfterLastSlash,
807	PathEnd,
808	QueryEnd,
809	};
810
811	size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part)
812	{
813	switch (part) {
814	case URLPart::QueryEnd:
815	return url.m_queryEnd;
816	case URLPart::PathEnd:
817	return url.m_pathEnd;
818	case URLPart::PathAfterLastSlash:
819	return url.m_pathAfterLastSlash;
820	case URLPart::PortEnd:
821	return url.m_hostEnd + url.m_portLength;
822	case URLPart::HostEnd:
823	return url.m_hostEnd;
824	case URLPart::PasswordEnd:
825	return url.m_passwordEnd;
826	case URLPart::UserEnd:
827	return url.m_userEnd;
828	case URLPart::UserStart:
829	return url.m_userStart;
830	case URLPart::SchemeEnd:
831	return url.m_schemeEnd;
832	}
833	ASSERT_NOT_REACHED();
834	return `0`;
835	}
836
837	void URLParser::copyASCIIStringUntil(const String& string, size_t length)
838	{
839	RELEASE_ASSERT(length <= string.length());
840	if (string.isNull())
841	return;
842	ASSERT(m_asciiBuffer.isEmpty());
843	if (string.is8Bit())
844	appendToASCIIBuffer(string.characters8(), length);
845	else {
846	const UChar* characters = string.characters16();
847	for (size_t i = `0`; i < length; ++i) {
848	UChar c = characters[i];
849	ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c));
850	appendToASCIIBuffer(c);
851	}
852	}
853	}
854
855	template<typename CharacterType>
856	void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, const URLTextEncoding*& nonUTF8QueryEncoding)
857	{
858	syntaxViolation(iterator);
859
860	m_asciiBuffer.clear();
861	copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
862	switch (part) {
863	case URLPart::QueryEnd:
864	m_url.m_queryEnd = base.m_queryEnd;
865	FALLTHROUGH;
866	case URLPart::PathEnd:
867	m_url.m_pathEnd = base.m_pathEnd;
868	FALLTHROUGH;
869	case URLPart::PathAfterLastSlash:
870	m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash;
871	FALLTHROUGH;
872	case URLPart::PortEnd:
873	m_url.m_portLength = base.m_portLength;
874	FALLTHROUGH;
875	case URLPart::HostEnd:
876	m_url.m_hostEnd = base.m_hostEnd;
877	FALLTHROUGH;
878	case URLPart::PasswordEnd:
879	m_url.m_passwordEnd = base.m_passwordEnd;
880	FALLTHROUGH;
881	case URLPart::UserEnd:
882	m_url.m_userEnd = base.m_userEnd;
883	FALLTHROUGH;
884	case URLPart::UserStart:
885	m_url.m_userStart = base.m_userStart;
886	FALLTHROUGH;
887	case URLPart::SchemeEnd:
888	m_url.m_isValid = base.m_isValid;
889	m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily;
890	m_url.m_schemeEnd = base.m_schemeEnd;
891	}
892	switch (scheme(StringView (m_asciiBuffer.data(), m_url.m_schemeEnd))) {
893	case Scheme::WS:
894	case Scheme::WSS:
895	nonUTF8QueryEncoding = nullptr;
896	m_urlIsSpecial = true;
897	return;
898	case Scheme::File:
899	m_urlIsFile = true;
900	FALLTHROUGH;
901	case Scheme::FTP:
902	case Scheme::HTTP:
903	case Scheme::HTTPS:
904	m_urlIsSpecial = true;
905	return;
906	case Scheme::NonSpecial:
907	m_urlIsSpecial = false;
908	nonUTF8QueryEncoding = nullptr;
909	return;
910	}
911	ASSERT_NOT_REACHED();
912	}
913
// The two hex digits of '.' (0x2E) as they appear after '%' in a percent-encoded dot.
// The second digit is stored lowercase; callers lowercase the input before comparing.
static const char dotASCIICode[2] = { '2', 'e' };
915
916	template<typename CharacterType>
917	ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c)
918	{
919	if (c.atEnd())
920	return false;
921	if (*c == `'.'`) {
922	advance<CharacterType, ReportSyntaxViolation::No>(c);
923	return c.atEnd() \|\| isSlashQuestionOrHash(*c);
924	}
925	if (*c != `'%'`)
926	return false;
927	advance<CharacterType, ReportSyntaxViolation::No>(c);
928	if (c.atEnd() \|\| *c != dotASCIICode[`0`])
929	return false;
930	advance<CharacterType, ReportSyntaxViolation::No>(c);
931	if (c.atEnd())
932	return false;
933	if (toASCIILower(*c) == dotASCIICode[`1`]) {
934	advance<CharacterType, ReportSyntaxViolation::No>(c);
935	return c.atEnd() \|\| isSlashQuestionOrHash(*c);
936	}
937	return false;
938	}
939
940	template<typename CharacterType>
941	ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c)
942	{
943	if (c.atEnd())
944	return false;
945	if (*c == `'.'`) {
946	advance<CharacterType, ReportSyntaxViolation::No>(c);
947	return isSingleDotPathSegment(c);
948	}
949	if (*c != `'%'`)
950	return false;
951	advance<CharacterType, ReportSyntaxViolation::No>(c);
952	if (c.atEnd() \|\| *c != dotASCIICode[`0`])
953	return false;
954	advance<CharacterType, ReportSyntaxViolation::No>(c);
955	if (c.atEnd())
956	return false;
957	if (toASCIILower(*c) == dotASCIICode[`1`]) {
958	advance<CharacterType, ReportSyntaxViolation::No>(c);
959	return isSingleDotPathSegment(c);
960	}
961	return false;
962	}
963
964	template<typename CharacterType>
965	void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c)
966	{
967	ASSERT(isSingleDotPathSegment(c));
968	if (*c == `'.'`) {
969	advance(c);
970	if (!c.atEnd()) {
971	if (c == `'/'` \|\| c == `'\\'`)
972	advance(c);
973	else
974	ASSERT(c == `'?'` \|\| c == `'#'`);
975	}
976	} else {
977	ASSERT(*c == `'%'`);
978	advance(c);
979	ASSERT(*c == dotASCIICode[`0`]);
980	advance(c);
981	ASSERT(toASCIILower(*c) == dotASCIICode[`1`]);
982	advance(c);
983	if (!c.atEnd()) {
984	if (c == `'/'` \|\| c == `'\\'`)
985	advance(c);
986	else
987	ASSERT(c == `'?'` \|\| c == `'#'`);
988	}
989	}
990	}
991
992	template<typename CharacterType>
993	void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c)
994	{
995	ASSERT(isDoubleDotPathSegment(c));
996	if (*c == `'.'`)
997	advance(c);
998	else {
999	ASSERT(*c == `'%'`);
1000	advance(c);
1001	ASSERT(*c == dotASCIICode[`0`]);
1002	advance(c);
1003	ASSERT(toASCIILower(*c) == dotASCIICode[`1`]);
1004	advance(c);
1005	}
1006	consumeSingleDotPathSegment(c);
1007	}
1008
1009	bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash)
1010	{
1011	ASSERT(m_didSeeSyntaxViolation);
1012	if (!m_urlIsFile)
1013	return true;
1014
1015	ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size());
1016	CodePointIterator<LChar> componentToPop(&m_asciiBuffer [newPathAfterLastSlash], &m_asciiBuffer [`0`] + m_url.m_pathAfterLastSlash);
1017	if (newPathAfterLastSlash == m_url.m_hostEnd + m_url.m_portLength + `1` && isWindowsDriveLetter(componentToPop))
1018	return false;
1019	return true;
1020	}
1021
1022	void URLParser::popPath()
1023	{
1024	ASSERT(m_didSeeSyntaxViolation);
1025	if (m_url.m_pathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength + `1`) {
1026	auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - `1`;
1027	if (m_asciiBuffer [newPathAfterLastSlash] == `'/'`)
1028	newPathAfterLastSlash--;
1029	while (newPathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength && m_asciiBuffer [newPathAfterLastSlash] != `'/'`)
1030	newPathAfterLastSlash--;
1031	newPathAfterLastSlash++;
1032	if (shouldPopPath(newPathAfterLastSlash))
1033	m_url.m_pathAfterLastSlash = newPathAfterLastSlash;
1034	}
1035	m_asciiBuffer.resize(m_url.m_pathAfterLastSlash);
1036	}
1037
1038	template<typename CharacterType>
1039	void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator)
1040	{
1041	if (m_didSeeSyntaxViolation)
1042	return;
1043	m_didSeeSyntaxViolation = true;
1044
1045	ASSERT(m_asciiBuffer.isEmpty());
1046	size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1047	RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
1048	m_asciiBuffer.reserveCapacity(m_inputString.length());
1049	for (size_t i = `0`; i < codeUnitsToCopy; ++i) {
1050	ASSERT(isASCII(m_inputString[i]));
1051	m_asciiBuffer.uncheckedAppend(m_inputString [i]);
1052	}
1053	}
1054
1055	void URLParser::failure()
1056	{
1057	m_url.invalidate();
1058	m_url.m_string = m_inputString;
1059	}
1060
1061	template<typename CharacterType>
1062	bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint)
1063	{
1064	if (iterator.atEnd() \|\| toASCIILower(*iterator) != codePoint)
1065	return false;
1066	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
1067	return true;
1068	}
1069
1070	template<typename CharacterType>
1071	bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator)
1072	{
1073	if (!checkLocalhostCodePoint(iterator, `'l'`))
1074	return false;
1075	if (!checkLocalhostCodePoint(iterator, `'o'`))
1076	return false;
1077	if (!checkLocalhostCodePoint(iterator, `'c'`))
1078	return false;
1079	if (!checkLocalhostCodePoint(iterator, `'a'`))
1080	return false;
1081	if (!checkLocalhostCodePoint(iterator, `'l'`))
1082	return false;
1083	if (!checkLocalhostCodePoint(iterator, `'h'`))
1084	return false;
1085	if (!checkLocalhostCodePoint(iterator, `'o'`))
1086	return false;
1087	if (!checkLocalhostCodePoint(iterator, `'s'`))
1088	return false;
1089	if (!checkLocalhostCodePoint(iterator, `'t'`))
1090	return false;
1091	return iterator.atEnd();
1092	}
1093
1094	bool URLParser::isLocalhost(StringView view)
1095	{
1096	if (view.is8Bit())
1097	return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length()));
1098	return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length()));
1099	}
1100
1101	ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length)
1102	{
1103	if (UNLIKELY(m_didSeeSyntaxViolation)) {
1104	ASSERT(start + length <= m_asciiBuffer.size());
1105	return StringView (m_asciiBuffer.data() + start, length);
1106	}
1107	ASSERT(start + length <= m_inputString.length());
1108	return StringView (m_inputString).substring(start, length);
1109	}
1110
1111	ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position)
1112	{
1113	if (UNLIKELY(m_didSeeSyntaxViolation))
1114	return m_asciiBuffer [position];
1115	return m_inputString [position];
1116	}
1117
1118	template<typename CharacterType>
1119	ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
1120	{
1121	if (UNLIKELY(m_didSeeSyntaxViolation))
1122	return m_asciiBuffer.size();
1123
1124	return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
1125	}
1126
1127	URLParser::URLParser(const String& input, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1128	: m_inputString (input)
1129	{
1130	if (input.isNull()) {
1131	if (base.isValid() && !base.m_cannotBeABaseURL) {
1132	m_url = base;
1133	m_url.removeFragmentIdentifier();
1134	}
1135	return;
1136	}
1137
1138	if (input.is8Bit()) {
1139	m_inputBegin = input.characters8();
1140	parse(input.characters8(), input.length(), base, nonUTF8QueryEncoding);
1141	} else {
1142	m_inputBegin = input.characters16();
1143	parse(input.characters16(), input.length(), base, nonUTF8QueryEncoding);
1144	}
1145
1146	ASSERT(!m_url.m_isValid
1147	\|\| m_didSeeSyntaxViolation == (m_url.string() != input)
1148	\|\| (input.isAllSpecialCharacters<isC0ControlOrSpace>()
1149	&& m_url.m_string == base.m_string.left(base.m_queryEnd)));
1150	ASSERT(internalValuesConsistent(m_url));
1151	#if !ASSERT_DISABLED
1152	if (!m_didSeeSyntaxViolation) {
1153	// Force a syntax violation at the beginning to make sure we get the same result.
1154	URLParser parser(makeString(" ", input), base, nonUTF8QueryEncoding);
1155	URL parsed = parser.result();
1156	if (parsed.isValid())
1157	ASSERT(allValuesEqual(parser.result(), m_url));
1158	}
1159	#endif
1160	}
1161
// Main state-machine parser.
//
// Resets m_url, trims trailing and leading C0-control/space code points (reporting a
// syntax violation when any are found), then walks the remaining input through the State
// enum declared below. Tabs and newlines inside the input are skipped and reported as
// syntax violations. Once the input is exhausted, a second switch finishes the URL
// according to the state the loop stopped in.
//
// input / length       - the characters to parse, in the width selected by the caller.
// base                 - URL used to resolve relative input; may be invalid.
// nonUTF8QueryEncoding - optional encoding for the query; reset to nullptr in this
//                        function for ws, wss and non-special schemes.
//
// On success m_url.m_string becomes the input string (no syntax violation) or the adopted
// m_asciiBuffer (syntax violation seen) and m_url.m_isValid is set. On error failure() is
// called and the function returns early.
1162	template<typename CharacterType>
1163	void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding)
1164	{
1165	URL_PARSER_LOG("Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data());
1166	m_url = { };
1167	ASSERT(m_asciiBuffer.isEmpty());
1168
// Collects raw query code points while in State::NonUTF8Query.
1169	Vector<UChar> queryBuffer;
1170
// Drop trailing C0-control/space code points. The violation is reported with an iterator
// at the very start of the input.
1171	unsigned endIndex = length;
1172	while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - `1`]))) {
1173	syntaxViolation(CodePointIterator<CharacterType>(input, input));
1174	endIndex--;
1175	}
1176	CodePointIterator<CharacterType> c(input, input + endIndex);
1177	CodePointIterator<CharacterType> authorityOrHostBegin;
1178	CodePointIterator<CharacterType> queryBegin;
// Skip leading C0-control/space code points.
1179	while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
1180	syntaxViolation(c);
1181	++c;
1182	}
// Restart point used when a tentative scheme turns out not to be a scheme.
1183	auto beginAfterControlAndSpace = c;
1184
// Parser states. The loop below dispatches on the current state; the switch after the
// loop handles the state in which the input ran out.
1185	enum class State : uint8_t {
1186	SchemeStart,
1187	Scheme,
1188	NoScheme,
1189	SpecialRelativeOrAuthority,
1190	PathOrAuthority,
1191	Relative,
1192	RelativeSlash,
1193	SpecialAuthoritySlashes,
1194	SpecialAuthorityIgnoreSlashes,
1195	AuthorityOrHost,
1196	Host,
1197	File,
1198	FileSlash,
1199	FileHost,
1200	PathStart,
1201	Path,
1202	CannotBeABaseURLPath,
1203	UTF8Query,
1204	NonUTF8Query,
1205	Fragment,
1206	};
1207
1208	#define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c))
1209	#define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x)
1210
1211	State state = State::SchemeStart;
// Main loop. An iteration either consumes input or only changes state, in which case the
// same code point is examined again in the new state.
1212	while (!c.atEnd()) {
// Tabs and newlines are skipped wherever they appear.
1213	if (UNLIKELY(isTabOrNewline(*c))) {
1214	syntaxViolation(c);
1215	++c;
1216	continue;
1217	}
1218
1219	switch (state) {
// An ASCII letter starts a candidate scheme; anything else means there is no scheme.
// If the input ends right after that letter, rewind and restart in NoScheme.
1220	case State::SchemeStart:
1221	LOG_STATE("SchemeStart");
1222	if (isASCIIAlpha(*c)) {
1223	if (UNLIKELY(isASCIIUpper(*c)))
1224	syntaxViolation(c);
1225	appendToASCIIBuffer(toASCIILower(*c));
1226	advance(c);
1227	if (c.atEnd()) {
1228	m_asciiBuffer.clear();
1229	state = State::NoScheme;
1230	c = beginAfterControlAndSpace;
1231	break;
1232	}
1233	state = State::Scheme;
1234	} else
1235	state = State::NoScheme;
1236	break;
// Accumulate lower-cased scheme characters. ':' ends the scheme and selects the next state
// from the kind of scheme; any other character, or reaching the end of input, rewinds to
// beginAfterControlAndSpace and restarts in NoScheme.
1237	case State::Scheme:
1238	LOG_STATE("Scheme");
1239	if (isValidSchemeCharacter(*c)) {
1240	if (UNLIKELY(isASCIIUpper(*c)))
1241	syntaxViolation(c);
1242	appendToASCIIBuffer(toASCIILower(*c));
1243	} else if (*c == `':'`) {
1244	unsigned schemeEnd = currentPosition(c);
1245	if (schemeEnd > URL::maxSchemeLength) {
1246	failure();
1247	return;
1248	}
1249	m_url.m_schemeEnd = schemeEnd;
1250	StringView urlScheme = parsedDataView(`0`, m_url.m_schemeEnd);
1251	appendToASCIIBuffer(`':'`);
1252	switch (scheme(urlScheme)) {
1253	case Scheme::File:
1254	m_urlIsSpecial = true;
1255	m_urlIsFile = true;
1256	state = State::File;
1257	++c;
1258	break;
// ws/wss: the non-UTF-8 query encoding is dropped.
1259	case Scheme::WS:
1260	case Scheme::WSS:
1261	nonUTF8QueryEncoding = nullptr;
1262	m_urlIsSpecial = true;
1263	if (base.protocolIs(urlScheme))
1264	state = State::SpecialRelativeOrAuthority;
1265	else
1266	state = State::SpecialAuthoritySlashes;
1267	++c;
1268	break;
1269	case Scheme::HTTP:
1270	case Scheme::HTTPS:
1271	m_url.m_protocolIsInHTTPFamily = true;
1272	FALLTHROUGH;
1273	case Scheme::FTP:
1274	m_urlIsSpecial = true;
1275	if (base.protocolIs(urlScheme))
1276	state = State::SpecialRelativeOrAuthority;
1277	else
1278	state = State::SpecialAuthoritySlashes;
1279	++c;
1280	break;
// Non-special scheme: a '/' right after the ':' leads to PathOrAuthority; otherwise the
// URL is marked cannot-be-a-base-URL and all authority offsets are set to the position
// just after the ':'.
1281	case Scheme::NonSpecial:
1282	nonUTF8QueryEncoding = nullptr;
1283	auto maybeSlash = c;
1284	advance(maybeSlash);
1285	if (!maybeSlash.atEnd() && *maybeSlash == `'/'`) {
1286	appendToASCIIBuffer(`'/'`);
1287	c = maybeSlash;
1288	state = State::PathOrAuthority;
1289	ASSERT(*c == `'/'`);
1290	++c;
1291	m_url.m_userStart = currentPosition(c);
1292	} else {
1293	++c;
1294	m_url.m_userStart = currentPosition(c);
1295	m_url.m_userEnd = m_url.m_userStart;
1296	m_url.m_passwordEnd = m_url.m_userStart;
1297	m_url.m_hostEnd = m_url.m_userStart;
1298	m_url.m_portLength = `0`;
1299	m_url.m_pathAfterLastSlash = m_url.m_userStart;
1300	m_url.m_cannotBeABaseURL = true;
1301	state = State::CannotBeABaseURLPath;
1302	}
1303	break;
1304	}
1305	break;
1306	} else {
1307	m_asciiBuffer.clear();
1308	state = State::NoScheme;
1309	c = beginAfterControlAndSpace;
1310	break;
1311	}
1312	advance(c);
1313	if (c.atEnd()) {
1314	m_asciiBuffer.clear();
1315	state = State::NoScheme;
1316	c = beginAfterControlAndSpace;
1317	}
1318	break;
// No scheme in the input: a valid base is required. A cannot-be-a-base-URL base is only
// accepted when the input starts with '#'. A base whose protocol is not "file" continues
// in Relative; otherwise the base scheme is copied and parsing continues in File.
1319	case State::NoScheme:
1320	LOG_STATE("NoScheme");
1321	if (!base.isValid() \|\| (base.m_cannotBeABaseURL && *c != `'#'`)) {
1322	failure();
1323	return;
1324	}
1325	if (base.m_cannotBeABaseURL && *c == `'#'`) {
1326	copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1327	state = State::Fragment;
1328	appendToASCIIBuffer(`'#'`);
1329	++c;
1330	break;
1331	}
1332	if (!base.protocolIs("file")) {
1333	state = State::Relative;
1334	break;
1335	}
1336	copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1337	appendToASCIIBuffer(`':'`);
1338	state = State::File;
1339	break;
// Special scheme equal to the base's: "//" leads to the authority, a single '/' is handled
// by RelativeSlash, anything else by Relative. Input ending right after the first '/' fails.
1340	case State::SpecialRelativeOrAuthority:
1341	LOG_STATE("SpecialRelativeOrAuthority");
1342	if (*c == `'/'`) {
1343	appendToASCIIBuffer(`'/'`);
1344	advance(c);
1345	if (c.atEnd()) {
1346	failure();
1347	return;
1348	}
1349	if (*c == `'/'`) {
1350	appendToASCIIBuffer(`'/'`);
1351	state = State::SpecialAuthorityIgnoreSlashes;
1352	++c;
1353	} else
1354	state = State::RelativeSlash;
1355	} else
1356	state = State::Relative;
1357	break;
// A second '/' starts an authority. Otherwise all authority offsets collapse onto the
// preceding '/' and the path begins there.
1358	case State::PathOrAuthority:
1359	LOG_STATE("PathOrAuthority");
1360	if (*c == `'/'`) {
1361	appendToASCIIBuffer(`'/'`);
1362	state = State::AuthorityOrHost;
1363	advance(c);
1364	m_url.m_userStart = currentPosition(c);
1365	authorityOrHostBegin = c;
1366	} else {
1367	ASSERT(parsedDataView(currentPosition(c) - `1`) == `'/'`);
1368	m_url.m_userStart = currentPosition(c) - `1`;
1369	m_url.m_userEnd = m_url.m_userStart;
1370	m_url.m_passwordEnd = m_url.m_userStart;
1371	m_url.m_hostEnd = m_url.m_userStart;
1372	m_url.m_portLength = `0`;
1373	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1374	state = State::Path;
1375	}
1376	break;
// Relative reference: how much of base is copied depends on the first code point.
// '?' keeps base through its path, '#' through its query, anything other than a slash
// keeps base up to the position after its last path slash.
1377	case State::Relative:
1378	LOG_STATE("Relative");
1379	switch (*c) {
1380	case `'/'`:
1381	case `'\\'`:
1382	state = State::RelativeSlash;
1383	++c;
1384	break;
1385	case `'?'`:
1386	copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1387	appendToASCIIBuffer(`'?'`);
1388	++c;
1389	if (nonUTF8QueryEncoding) {
1390	queryBegin = c;
1391	state = State::NonUTF8Query;
1392	} else
1393	state = State::UTF8Query;
1394	break;
1395	case `'#'`:
1396	copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1397	appendToASCIIBuffer(`'#'`);
1398	state = State::Fragment;
1399	++c;
1400	break;
1401	default:
1402	copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
1403	if (currentPosition(c) && parsedDataView(currentPosition(c) - `1`) != `'/'`) {
1404	appendToASCIIBuffer(`'/'`);
1405	m_url.m_pathAfterLastSlash = currentPosition(c);
1406	}
1407	state = State::Path;
1408	break;
1409	}
1410	break;
// After one leading slash: a second slash keeps only the base scheme and continues with
// an authority; anything else keeps base through its port and starts a new path.
1411	case State::RelativeSlash:
1412	LOG_STATE("RelativeSlash");
1413	if (c == `'/'` \|\| c == `'\\'`) {
1414	++c;
1415	copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding);
1416	appendToASCIIBuffer("://", `3`);
1417	if (m_urlIsSpecial)
1418	state = State::SpecialAuthorityIgnoreSlashes;
1419	else {
1420	m_url.m_userStart = currentPosition(c);
1421	state = State::AuthorityOrHost;
1422	authorityOrHostBegin = c;
1423	}
1424	} else {
1425	copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1426	appendToASCIIBuffer(`'/'`);
1427	m_url.m_pathAfterLastSlash = base.m_hostEnd + base.m_portLength + `1`;
1428	state = State::Path;
1429	}
1430	break;
// Expect two slashes after a special scheme. Backslashes are accepted as syntax
// violations, and "//" is always written to the output even when slashes are missing.
1431	case State::SpecialAuthoritySlashes:
1432	LOG_STATE("SpecialAuthoritySlashes");
1433	if (LIKELY(c == `'/'` \|\| c == `'\\'`)) {
1434	if (UNLIKELY(*c == `'\\'`))
1435	syntaxViolation(c);
1436	appendToASCIIBuffer(`'/'`);
1437	advance(c);
1438	if (LIKELY(!c.atEnd() && (c == `'/'` \|\| c == `'\\'`))) {
1439	if (UNLIKELY(*c == `'\\'`))
1440	syntaxViolation(c);
1441	++c;
1442	appendToASCIIBuffer(`'/'`);
1443	} else {
1444	syntaxViolation(c);
1445	appendToASCIIBuffer(`'/'`);
1446	}
1447	} else {
1448	syntaxViolation(c);
1449	appendToASCIIBuffer("//", `2`);
1450	}
1451	state = State::SpecialAuthorityIgnoreSlashes;
1452	break;
// Additional slashes or backslashes are dropped as syntax violations; the first other
// code point begins the authority.
1453	case State::SpecialAuthorityIgnoreSlashes:
1454	LOG_STATE("SpecialAuthorityIgnoreSlashes");
1455	if (c == `'/'` \|\| c == `'\\'`) {
1456	syntaxViolation(c);
1457	++c;
1458	} else {
1459	m_url.m_userStart = currentPosition(c);
1460	state = State::AuthorityOrHost;
1461	authorityOrHostBegin = c;
1462	}
1463	break;
// Scan forward without emitting output. On '@' the text up to the last '@' before the next
// slash, '?' or '#' is parsed as credentials and the host follows. On a slash, '?' or '#'
// the scanned text is parsed as host and port; an empty host fails for special URLs.
1464	case State::AuthorityOrHost:
1465	do {
1466	LOG_STATE("AuthorityOrHost");
1467	if (*c == `'@'`) {
1468	auto lastAt = c;
1469	auto findLastAt = c;
1470	while (!findLastAt.atEnd()) {
1471	URL_PARSER_LOG("Finding last @: %c", *findLastAt);
1472	if (*findLastAt == `'@'`)
1473	lastAt = findLastAt;
1474	bool isSlash = findLastAt == `'/'` \|\| (m_urlIsSpecial && findLastAt == `'\\'`);
1475	if (isSlash \|\| findLastAt == `'?'` \|\| findLastAt == `'#'`)
1476	break;
1477	++findLastAt;
1478	}
1479	parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt));
1480	c = lastAt;
1481	advance(c);
1482	authorityOrHostBegin = c;
1483	state = State::Host;
1484	m_hostHasPercentOrNonASCII = false;
1485	break;
1486	}
1487	bool isSlash = c == `'/'` \|\| (m_urlIsSpecial && c == `'\\'`);
1488	if (isSlash \|\| c == `'?'` \|\| c == `'#'`) {
1489	auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c);
1490	if (iterator.atEnd()) {
1491	if (m_urlIsSpecial)
1492	return failure();
1493	m_url.m_userEnd = currentPosition(c);
1494	m_url.m_passwordEnd = m_url.m_userEnd;
1495	m_url.m_hostEnd = m_url.m_userEnd;
1496	m_url.m_portLength = `0`;
1497	m_url.m_pathAfterLastSlash = m_url.m_userEnd;
1498	} else {
1499	m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1500	m_url.m_passwordEnd = m_url.m_userEnd;
1501	if (!parseHostAndPort(iterator)) {
1502	failure();
1503	return;
1504	}
1505	if (UNLIKELY(!isSlash)) {
1506	if (m_urlIsSpecial) {
1507	syntaxViolation(c);
1508	appendToASCIIBuffer(`'/'`);
1509	}
1510	m_url.m_pathAfterLastSlash = currentPosition(c);
1511	}
1512	}
1513	state = State::Path;
1514	break;
1515	}
1516	if (isPercentOrNonASCII(*c))
1517	m_hostHasPercentOrNonASCII = true;
1518	++c;
1519	} while (!c.atEnd());
1520	break;
// Host following credentials: scan to '/', '?' or '#', then parse host and port. When the
// delimiter is '?' or '#', a '/' is added to the output as a syntax violation.
1521	case State::Host:
1522	do {
1523	LOG_STATE("Host");
1524	if (c == `'/'` \|\| c == `'?'` \|\| *c == `'#'`) {
1525	if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1526	failure();
1527	return;
1528	}
1529	if (c == `'?'` \|\| c == `'#'`) {
1530	syntaxViolation(c);
1531	appendToASCIIBuffer(`'/'`);
1532	m_url.m_pathAfterLastSlash = currentPosition(c);
1533	}
1534	state = State::Path;
1535	break;
1536	}
1537	if (isPercentOrNonASCII(*c))
1538	m_hostHasPercentOrNonASCII = true;
1539	++c;
1540	} while (!c.atEnd());
1541	break;
// First code point after "file:". A slash or backslash leads to FileSlash. '?', '#' and
// everything else are syntax violations that either reuse parts of a file base or write
// an empty host ("///") to the output.
1542	case State::File:
1543	LOG_STATE("File");
1544	switch (*c) {
1545	case `'\\'`:
1546	syntaxViolation(c);
1547	FALLTHROUGH;
1548	case `'/'`:
1549	appendToASCIIBuffer(`'/'`);
1550	state = State::FileSlash;
1551	++c;
1552	break;
1553	case `'?'`:
1554	syntaxViolation(c);
1555	if (base.isValid() && base.protocolIs("file")) {
1556	copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding);
1557	appendToASCIIBuffer(`'?'`);
1558	++c;
1559	} else {
1560	appendToASCIIBuffer("///?", `4`);
1561	++c;
1562	m_url.m_userStart = currentPosition(c) - `2`;
1563	m_url.m_userEnd = m_url.m_userStart;
1564	m_url.m_passwordEnd = m_url.m_userStart;
1565	m_url.m_hostEnd = m_url.m_userStart;
1566	m_url.m_portLength = `0`;
1567	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1568	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1569	}
1570	if (nonUTF8QueryEncoding) {
1571	queryBegin = c;
1572	state = State::NonUTF8Query;
1573	} else
1574	state = State::UTF8Query;
1575	break;
1576	case `'#'`:
1577	syntaxViolation(c);
1578	if (base.isValid() && base.protocolIs("file")) {
1579	copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1580	appendToASCIIBuffer(`'#'`);
1581	} else {
1582	appendToASCIIBuffer("///#", `4`);
1583	m_url.m_userStart = currentPosition(c) - `2`;
1584	m_url.m_userEnd = m_url.m_userStart;
1585	m_url.m_passwordEnd = m_url.m_userStart;
1586	m_url.m_hostEnd = m_url.m_userStart;
1587	m_url.m_portLength = `0`;
1588	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1589	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1590	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1591	}
1592	state = State::Fragment;
1593	++c;
1594	break;
1595	default:
1596	syntaxViolation(c);
1597	if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c))
1598	copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding);
1599	else {
1600	appendToASCIIBuffer("///", `3`);
1601	m_url.m_userStart = currentPosition(c) - `1`;
1602	m_url.m_userEnd = m_url.m_userStart;
1603	m_url.m_passwordEnd = m_url.m_userStart;
1604	m_url.m_hostEnd = m_url.m_userStart;
1605	m_url.m_portLength = `0`;
1606	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1607	if (isWindowsDriveLetter(c))
1608	appendWindowsDriveLetter(c);
1609	}
1610	state = State::Path;
1611	break;
1612	}
1613	break;
// After "file:/". A second slash starts the file host. Otherwise "//" is written to the
// output as a syntax violation and the path starts, taking a Windows drive letter from
// the input or, failing that, from base when one is available.
1614	case State::FileSlash:
1615	LOG_STATE("FileSlash");
1616	if (LIKELY(c == `'/'` \|\| c == `'\\'`)) {
1617	if (UNLIKELY(*c == `'\\'`))
1618	syntaxViolation(c);
1619	appendToASCIIBuffer(`'/'`);
1620	advance(c);
1621	m_url.m_userStart = currentPosition(c);
1622	m_url.m_userEnd = m_url.m_userStart;
1623	m_url.m_passwordEnd = m_url.m_userStart;
1624	m_url.m_hostEnd = m_url.m_userStart;
1625	m_url.m_portLength = `0`;
1626	authorityOrHostBegin = c;
1627	state = State::FileHost;
1628	break;
1629	}
1630	syntaxViolation(c);
1631	appendToASCIIBuffer("//", `2`);
1632	m_url.m_userStart = currentPosition(c) - `1`;
1633	m_url.m_userEnd = m_url.m_userStart;
1634	m_url.m_passwordEnd = m_url.m_userStart;
1635	m_url.m_hostEnd = m_url.m_userStart;
1636	m_url.m_portLength = `0`;
1637	if (isWindowsDriveLetter(c)) {
1638	appendWindowsDriveLetter(c);
1639	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1640	} else if (copyBaseWindowsDriveLetter(base)) {
1641	appendToASCIIBuffer(`'/'`);
1642	m_url.m_pathAfterLastSlash = m_url.m_userStart + `4`;
1643	} else
1644	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1645	state = State::Path;
1646	break;
// Host of a file URL: scan to a slash, '?' or '#'. A two-code-point "host" that is a
// Windows drive letter is moved into the path (windowsQuirk). An empty host goes straight
// to the path, query or fragment. Otherwise host and port are parsed, and a host equal to
// "localhost" is removed from the output again.
1647	case State::FileHost:
1648	do {
1649	LOG_STATE("FileHost");
1650	if (isSlashQuestionOrHash(*c)) {
1651	bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1652	&& isWindowsDriveLetter(authorityOrHostBegin);
1653	if (windowsQuirk) {
1654	syntaxViolation(authorityOrHostBegin);
1655	appendToASCIIBuffer(`'/'`);
1656	appendWindowsDriveLetter(authorityOrHostBegin);
1657	}
1658	if (windowsQuirk \|\| authorityOrHostBegin == c) {
1659	ASSERT(windowsQuirk \|\| parsedDataView(currentPosition(c) - `1`) == `'/'`);
1660	if (UNLIKELY(*c == `'?'`)) {
1661	syntaxViolation(c);
1662	appendToASCIIBuffer("/?", `2`);
1663	++c;
1664	if (nonUTF8QueryEncoding) {
1665	queryBegin = c;
1666	state = State::NonUTF8Query;
1667	} else
1668	state = State::UTF8Query;
1669	m_url.m_pathAfterLastSlash = currentPosition(c) - `1`;
1670	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1671	break;
1672	}
1673	if (UNLIKELY(*c == `'#'`)) {
1674	syntaxViolation(c);
1675	appendToASCIIBuffer("/#", `2`);
1676	++c;
1677	m_url.m_pathAfterLastSlash = currentPosition(c) - `1`;
1678	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1679	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1680	state = State::Fragment;
1681	break;
1682	}
1683	state = State::Path;
1684	break;
1685	}
1686	if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1687	failure();
1688	return;
1689	}
1690	if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) {
1691	syntaxViolation(c);
1692	m_asciiBuffer.shrink(m_url.m_passwordEnd);
1693	m_url.m_hostEnd = currentPosition(c);
1694	m_url.m_portLength = `0`;
1695	}
1696
1697	state = State::PathStart;
1698	break;
1699	}
1700	if (isPercentOrNonASCII(*c))
1701	m_hostHasPercentOrNonASCII = true;
1702	++c;
1703	} while (!c.atEnd());
1704	break;
// Ensure the path starts with '/': one is added to the output as a syntax violation when
// the current code point is not a slash or backslash.
1705	case State::PathStart:
1706	LOG_STATE("PathStart");
1707	if (c != `'/'` && c != `'\\'`) {
1708	syntaxViolation(c);
1709	appendToASCIIBuffer(`'/'`);
1710	}
1711	m_url.m_pathAfterLastSlash = currentPosition(c);
1712	state = State::Path;
1713	break;
// Path. Slashes (and backslashes for special URLs) update m_pathAfterLastSlash. Directly
// after a slash, ".." and "." segments are consumed as syntax violations, with ".." also
// popping the previous segment. '?' and '#' end the path; every other code point is
// percent-encoded with the default encode set.
1714	case State::Path:
1715	LOG_STATE("Path");
1716	if (c == `'/'` \|\| (m_urlIsSpecial && c == `'\\'`)) {
1717	if (UNLIKELY(m_urlIsSpecial && *c == `'\\'`))
1718	syntaxViolation(c);
1719	appendToASCIIBuffer(`'/'`);
1720	++c;
1721	m_url.m_pathAfterLastSlash = currentPosition(c);
1722	break;
1723	}
1724	if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - `1`) == `'/'`)) {
1725	if (UNLIKELY(isDoubleDotPathSegment(c))) {
1726	syntaxViolation(c);
1727	consumeDoubleDotPathSegment(c);
1728	popPath();
1729	break;
1730	}
1731	if (UNLIKELY(isSingleDotPathSegment(c))) {
1732	syntaxViolation(c);
1733	consumeSingleDotPathSegment(c);
1734	break;
1735	}
1736	}
1737	if (*c == `'?'`) {
1738	m_url.m_pathEnd = currentPosition(c);
1739	appendToASCIIBuffer(`'?'`);
1740	++c;
1741	if (nonUTF8QueryEncoding) {
1742	queryBegin = c;
1743	state = State::NonUTF8Query;
1744	} else
1745	state = State::UTF8Query;
1746	break;
1747	}
// '#' is not consumed here; the Fragment state sees it on the next iteration.
1748	if (*c == `'#'`) {
1749	m_url.m_pathEnd = currentPosition(c);
1750	m_url.m_queryEnd = m_url.m_pathEnd;
1751	state = State::Fragment;
1752	break;
1753	}
1754	utf8PercentEncode<isInDefaultEncodeSet>(c);
1755	++c;
1756	break;
// Path of a cannot-be-a-base-URL: no dot-segment handling, and code points are
// percent-encoded with the simple encode set.
1757	case State::CannotBeABaseURLPath:
1758	LOG_STATE("CannotBeABaseURLPath");
1759	if (*c == `'?'`) {
1760	m_url.m_pathEnd = currentPosition(c);
1761	appendToASCIIBuffer(`'?'`);
1762	++c;
1763	if (nonUTF8QueryEncoding) {
1764	queryBegin = c;
1765	state = State::NonUTF8Query;
1766	} else
1767	state = State::UTF8Query;
1768	} else if (*c == `'#'`) {
1769	m_url.m_pathEnd = currentPosition(c);
1770	m_url.m_queryEnd = m_url.m_pathEnd;
1771	state = State::Fragment;
1772	} else if (*c == `'/'`) {
1773	appendToASCIIBuffer(`'/'`);
1774	++c;
1775	m_url.m_pathAfterLastSlash = currentPosition(c);
1776	} else {
1777	utf8PercentEncode<isInSimpleEncodeSet>(c);
1778	++c;
1779	}
1780	break;
// Query encoded code point by code point via utf8QueryEncode(); '#' ends it.
1781	case State::UTF8Query:
1782	LOG_STATE("UTF8Query");
1783	ASSERT(queryBegin == CodePointIterator<CharacterType>());
1784	if (*c == `'#'`) {
1785	m_url.m_queryEnd = currentPosition(c);
1786	state = State::Fragment;
1787	break;
1788	}
1789	ASSERT(!nonUTF8QueryEncoding);
1790	utf8QueryEncode(c);
1791	++c;
1792	break;
// Query in a non-UTF-8 encoding: code points are collected in queryBuffer and encoded as
// a whole when '#' is reached (or at end of input, in the switch after the loop).
1793	case State::NonUTF8Query:
1794	do {
1795	LOG_STATE("NonUTF8Query");
1796	ASSERT(queryBegin != CodePointIterator<CharacterType>());
1797	if (*c == `'#'`) {
1798	encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
1799	m_url.m_queryEnd = currentPosition(c);
1800	state = State::Fragment;
1801	break;
1802	}
1803	appendCodePoint(queryBuffer, *c);
1804	advance(c, queryBegin);
1805	} while (!c.atEnd());
1806	break;
// Fragment: every remaining code point is percent-encoded with the simple encode set.
1807	case State::Fragment:
1808	URL_PARSER_LOG("State Fragment");
1809	utf8PercentEncode<isInSimpleEncodeSet>(c);
1810	++c;
1811	break;
1812	}
1813	}
1814
// End of input: fill in the remaining m_url offsets according to the state the loop
// stopped in. States that cannot yield a URL call failure() and return.
1815	switch (state) {
// Nothing parsed at all: the result is base without its fragment when base is usable.
1816	case State::SchemeStart:
1817	LOG_FINAL_STATE("SchemeStart");
1818	if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) {
1819	m_url = base;
1820	m_url.removeFragmentIdentifier();
1821	return;
1822	}
1823	failure();
1824	return;
1825	case State::Scheme:
1826	LOG_FINAL_STATE("Scheme");
1827	failure();
1828	return;
1829	case State::NoScheme:
1830	LOG_FINAL_STATE("NoScheme");
1831	RELEASE_ASSERT_NOT_REACHED();
1832	case State::SpecialRelativeOrAuthority:
1833	LOG_FINAL_STATE("SpecialRelativeOrAuthority");
1834	copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1835	break;
// Input ended right after "scheme:/": the path is that single '/'.
1836	case State::PathOrAuthority:
1837	LOG_FINAL_STATE("PathOrAuthority");
1838	ASSERT(m_url.m_userStart);
1839	ASSERT(m_url.m_userStart == currentPosition(c));
1840	ASSERT(parsedDataView(currentPosition(c) - `1`) == `'/'`);
1841	m_url.m_userStart--;
1842	m_url.m_userEnd = m_url.m_userStart;
1843	m_url.m_passwordEnd = m_url.m_userStart;
1844	m_url.m_hostEnd = m_url.m_userStart;
1845	m_url.m_portLength = `0`;
1846	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1847	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1848	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1849	break;
1850	case State::Relative:
1851	LOG_FINAL_STATE("Relative");
1852	RELEASE_ASSERT_NOT_REACHED();
// Input was a single slash: keep base through its port and use "/" as the path.
1853	case State::RelativeSlash:
1854	LOG_FINAL_STATE("RelativeSlash");
1855	copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding);
1856	appendToASCIIBuffer(`'/'`);
1857	m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + `1`;
1858	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1859	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1860	break;
// Input ended right after the ':' of a special scheme: every offset is set to the end.
1861	case State::SpecialAuthoritySlashes:
1862	LOG_FINAL_STATE("SpecialAuthoritySlashes");
1863	m_url.m_userStart = currentPosition(c);
1864	m_url.m_userEnd = m_url.m_userStart;
1865	m_url.m_passwordEnd = m_url.m_userStart;
1866	m_url.m_hostEnd = m_url.m_userStart;
1867	m_url.m_portLength = `0`;
1868	m_url.m_pathAfterLastSlash = m_url.m_userStart;
1869	m_url.m_pathEnd = m_url.m_userStart;
1870	m_url.m_queryEnd = m_url.m_userStart;
1871	break;
1872	case State::SpecialAuthorityIgnoreSlashes:
1873	LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
1874	failure();
1875	return;
// Input ended inside the host (no credentials seen). For special URLs a trailing '/' is
// added to the output as a syntax violation.
1876	case State::AuthorityOrHost:
1877	LOG_FINAL_STATE("AuthorityOrHost");
1878	m_url.m_userEnd = currentPosition(authorityOrHostBegin);
1879	m_url.m_passwordEnd = m_url.m_userEnd;
1880	if (authorityOrHostBegin.atEnd()) {
1881	m_url.m_userEnd = m_url.m_userStart;
1882	m_url.m_passwordEnd = m_url.m_userStart;
1883	m_url.m_hostEnd = m_url.m_userStart;
1884	m_url.m_portLength = `0`;
1885	m_url.m_pathEnd = m_url.m_userStart;
1886	} else if (!parseHostAndPort(authorityOrHostBegin)) {
1887	failure();
1888	return;
1889	} else {
1890	if (m_urlIsSpecial) {
1891	syntaxViolation(c);
1892	appendToASCIIBuffer(`'/'`);
1893	m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + `1`;
1894	} else
1895	m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1896	}
1897	m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1898	m_url.m_queryEnd = m_url.m_pathEnd;
1899	break;
// Input ended inside the host that follows credentials.
1900	case State::Host:
1901	LOG_FINAL_STATE("Host");
1902	if (!parseHostAndPort(authorityOrHostBegin)) {
1903	failure();
1904	return;
1905	}
1906	if (m_urlIsSpecial) {
1907	syntaxViolation(c);
1908	appendToASCIIBuffer(`'/'`);
1909	m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + `1`;
1910	} else
1911	m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength;
1912	m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
1913	m_url.m_queryEnd = m_url.m_pathEnd;
1914	break;
// Input ended right after "file:": reuse a file base through its query, or write "///".
1915	case State::File:
1916	LOG_FINAL_STATE("File");
1917	if (base.isValid() && base.protocolIs("file")) {
1918	copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding);
1919	break;
1920	}
1921	syntaxViolation(c);
1922	appendToASCIIBuffer("///", `3`);
1923	m_url.m_userStart = currentPosition(c) - `1`;
1924	m_url.m_userEnd = m_url.m_userStart;
1925	m_url.m_passwordEnd = m_url.m_userStart;
1926	m_url.m_hostEnd = m_url.m_userStart;
1927	m_url.m_portLength = `0`;
1928	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1929	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1930	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1931	break;
// Input ended after "file:/": m_userStart is computed before "//" is appended, so it
// refers to the position after the first of the two appended slashes.
1932	case State::FileSlash:
1933	LOG_FINAL_STATE("FileSlash");
1934	syntaxViolation(c);
1935	m_url.m_userStart = currentPosition(c) + `1`;
1936	appendToASCIIBuffer("//", `2`);
1937	m_url.m_userEnd = m_url.m_userStart;
1938	m_url.m_passwordEnd = m_url.m_userStart;
1939	m_url.m_hostEnd = m_url.m_userStart;
1940	m_url.m_portLength = `0`;
1941	if (copyBaseWindowsDriveLetter(base)) {
1942	appendToASCIIBuffer(`'/'`);
1943	m_url.m_pathAfterLastSlash = m_url.m_userStart + `4`;
1944	} else
1945	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1946	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1947	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1948	break;
// Input ended inside a file host. Three cases, in order: a Windows drive letter in host
// position, an empty host, and a real host (removed from the output when it is "localhost").
1949	case State::FileHost:
1950	LOG_FINAL_STATE("FileHost");
1951	if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c))
1952	&& isWindowsDriveLetter(authorityOrHostBegin)) {
1953	syntaxViolation(authorityOrHostBegin);
1954	appendToASCIIBuffer(`'/'`);
1955	appendWindowsDriveLetter(authorityOrHostBegin);
1956	m_url.m_pathAfterLastSlash = currentPosition(c);
1957	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1958	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1959	break;
1960	}
1961
1962	if (authorityOrHostBegin == c) {
1963	syntaxViolation(c);
1964	appendToASCIIBuffer(`'/'`);
1965	m_url.m_userStart = currentPosition(c) - `1`;
1966	m_url.m_userEnd = m_url.m_userStart;
1967	m_url.m_passwordEnd = m_url.m_userStart;
1968	m_url.m_hostEnd = m_url.m_userStart;
1969	m_url.m_portLength = `0`;
1970	m_url.m_pathAfterLastSlash = m_url.m_userStart + `1`;
1971	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1972	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1973	break;
1974	}
1975
1976	if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) {
1977	failure();
1978	return;
1979	}
1980
1981	syntaxViolation(c);
1982	if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) {
1983	m_asciiBuffer.shrink(m_url.m_passwordEnd);
1984	m_url.m_hostEnd = currentPosition(c);
1985	m_url.m_portLength = `0`;
1986	}
1987	appendToASCIIBuffer(`'/'`);
1988	m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + `1`;
1989	m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
1990	m_url.m_queryEnd = m_url.m_pathAfterLastSlash;
1991	break;
1992	case State::PathStart:
1993	LOG_FINAL_STATE("PathStart");
1994	RELEASE_ASSERT_NOT_REACHED();
1995	case State::Path:
1996	LOG_FINAL_STATE("Path");
1997	m_url.m_pathEnd = currentPosition(c);
1998	m_url.m_queryEnd = m_url.m_pathEnd;
1999	break;
2000	case State::CannotBeABaseURLPath:
2001	LOG_FINAL_STATE("CannotBeABaseURLPath");
2002	m_url.m_pathEnd = currentPosition(c);
2003	m_url.m_queryEnd = m_url.m_pathEnd;
2004	break;
2005	case State::UTF8Query:
2006	LOG_FINAL_STATE("UTF8Query");
2007	ASSERT(queryBegin == CodePointIterator<CharacterType>());
2008	m_url.m_queryEnd = currentPosition(c);
2009	break;
// The buffered query is encoded here when no '#' ended it inside the loop.
2010	case State::NonUTF8Query:
2011	LOG_FINAL_STATE("NonUTF8Query");
2012	ASSERT(queryBegin != CodePointIterator<CharacterType>());
2013	encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c));
2014	m_url.m_queryEnd = currentPosition(c);
2015	break;
2016	case State::Fragment:
2017	LOG_FINAL_STATE("Fragment");
2018	break;
2019	}
2020
// Without syntax violations the input string is reused as the URL string; otherwise the
// rebuilt m_asciiBuffer is adopted as the URL string.
2021	if (LIKELY(!m_didSeeSyntaxViolation)) {
2022	m_url.m_string = m_inputString;
2023	ASSERT(m_asciiBuffer.isEmpty());
2024	} else
2025	m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
2026	m_url.m_isValid = true;
2027	URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
2028	}
2029
2030	template<typename CharacterType>
2031	void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
2032	{
2033	if (UNLIKELY(iterator.atEnd())) {
2034	syntaxViolation(iterator);
2035	m_url.m_userEnd = currentPosition(iterator);
2036	m_url.m_passwordEnd = m_url.m_userEnd;
2037	return;
2038	}
2039	for (; !iterator.atEnd(); advance(iterator)) {
2040	if (*iterator == `':'`) {
2041	m_url.m_userEnd = currentPosition(iterator);
2042	auto iteratorAtColon = iterator;
2043	++iterator;
2044	bool tabOrNewlineAfterColon = false;
2045	while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) {
2046	tabOrNewlineAfterColon = true;
2047	++iterator;
2048	}
2049	if (UNLIKELY(iterator.atEnd())) {
2050	syntaxViolation(iteratorAtColon);
2051	m_url.m_passwordEnd = m_url.m_userEnd;
2052	if (m_url.m_userEnd > m_url.m_userStart)
2053	appendToASCIIBuffer(`'@'`);
2054	return;
2055	}
2056	if (tabOrNewlineAfterColon)
2057	syntaxViolation(iteratorAtColon);
2058	appendToASCIIBuffer(`':'`);
2059	break;
2060	}
2061	utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator);
2062	}
2063	for (; !iterator.atEnd(); advance(iterator))
2064	utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator);
2065	m_url.m_passwordEnd = currentPosition(iterator);
2066	if (!m_url.m_userEnd)
2067	m_url.m_userEnd = m_url.m_passwordEnd;
2068	appendToASCIIBuffer(`'@'`);
2069	}
2070
2071	template<typename UnsignedIntegerType>
2072	void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number)
2073	{
2074	LChar buf[sizeof(UnsignedIntegerType) * `3` + `1`];
2075	LChar* end = std::end(buf);
2076	LChar* p = end;
2077	do {
2078	*--p = (number % `10`) + `'0'`;
2079	number /= `10`;
2080	} while (number);
2081	appendToASCIIBuffer(p, end - p);
2082	}
2083
2084	void URLParser::serializeIPv4(IPv4Address address)
2085	{
2086	appendNumberToASCIIBuffer<uint8_t>(address >> `24`);
2087	appendToASCIIBuffer(`'.'`);
2088	appendNumberToASCIIBuffer<uint8_t>(address >> `16`);
2089	appendToASCIIBuffer(`'.'`);
2090	appendNumberToASCIIBuffer<uint8_t>(address >> `8`);
2091	appendToASCIIBuffer(`'.'`);
2092	appendNumberToASCIIBuffer<uint8_t>(address);
2093	}
2094
// Returns how many consecutive zero pieces |address| contains starting at index |begin|.
// Returns 0 when address[begin] is non-zero or |begin| is at/after the end of the array.
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
2104
2105	static Optional<size_t> findLongestZeroSequence(const std::array<uint16_t, `8`>& address)
2106	{
2107	Optional<size_t> longest;
2108	size_t longestLength = `0`;
2109	for (size_t i = `0`; i < `8`; i++) {
2110	size_t length = zeroSequenceLength(address, i);
2111	if (length) {
2112	if (length > `1` && (!longest \|\| longestLength < length)) {
2113	longest = i;
2114	longestLength = length;
2115	}
2116	i += length;
2117	}
2118	}
2119	return longest;
2120	}
2121
2122	void URLParser::serializeIPv6Piece(uint16_t piece)
2123	{
2124	bool printed = false;
2125	if (auto nibble0 = piece >> `12`) {
2126	appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0));
2127	printed = true;
2128	}
2129	auto nibble1 = piece >> `8` & `0xF`;
2130	if (printed \|\| nibble1) {
2131	appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1));
2132	printed = true;
2133	}
2134	auto nibble2 = piece >> `4` & `0xF`;
2135	if (printed \|\| nibble2)
2136	appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2));
2137	appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & `0xF`));
2138	}
2139
2140	void URLParser::serializeIPv6(URLParser::IPv6Address address)
2141	{
2142	appendToASCIIBuffer(`'['`);
2143	auto compressPointer = findLongestZeroSequence(address);
2144	for (size_t piece = `0`; piece < `8`; piece++) {
2145	if (compressPointer && compressPointer.value() == piece) {
2146	ASSERT(!address[piece]);
2147	if (piece)
2148	appendToASCIIBuffer(`':'`);
2149	else
2150	appendToASCIIBuffer("::", `2`);
2151	while (piece < `8` && !address [piece])
2152	piece++;
2153	if (piece == `8`)
2154	break;
2155	}
2156	serializeIPv6Piece(address [piece]);
2157	if (piece < `7`)
2158	appendToASCIIBuffer(`':'`);
2159	}
2160	appendToASCIIBuffer(`']'`);
2161	}
2162
// Reasons parseIPv4Piece() can fail to produce a value for one dot-separated piece.
2163	enum class URLParser::IPv4PieceParsingError {
// The piece starts at a '.' or contains a character that is not a digit in the detected base.
2164	Failure,
// The digits were valid but the checked 32-bit accumulator overflowed.
2165	Overflow,
2166	};
2167
2168	template<typename CharacterType>
2169	Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation)
2170	{
2171	enum class State : uint8_t {
2172	UnknownBase,
2173	Decimal,
2174	OctalOrHex,
2175	Octal,
2176	Hex,
2177	};
2178	State state = State::UnknownBase;
2179	Checked<uint32_t, RecordOverflow> value = `0`;
2180	if (!iterator.atEnd() && *iterator == `'.'`)
2181	return makeUnexpected(IPv4PieceParsingError::Failure);
2182	while (!iterator.atEnd()) {
2183	if (isTabOrNewline(*iterator)) {
2184	didSeeSyntaxViolation = true;
2185	++iterator;
2186	continue;
2187	}
2188	if (*iterator == `'.'`) {
2189	ASSERT(!value.hasOverflowed());
2190	return value.unsafeGet();
2191	}
2192	switch (state) {
2193	case State::UnknownBase:
2194	if (UNLIKELY(*iterator == `'0'`)) {
2195	++iterator;
2196	state = State::OctalOrHex;
2197	break;
2198	}
2199	state = State::Decimal;
2200	break;
2201	case State::OctalOrHex:
2202	didSeeSyntaxViolation = true;
2203	if (iterator == `'x'` \|\| iterator == `'X'`) {
2204	++iterator;
2205	state = State::Hex;
2206	break;
2207	}
2208	state = State::Octal;
2209	break;
2210	case State::Decimal:
2211	if (!isASCIIDigit(*iterator))
2212	return makeUnexpected(IPv4PieceParsingError::Failure);
2213	value *= `10`;
2214	value += *iterator - `'0'`;
2215	if (UNLIKELY(value.hasOverflowed()))
2216	return makeUnexpected(IPv4PieceParsingError::Overflow);
2217	++iterator;
2218	break;
2219	case State::Octal:
2220	ASSERT(didSeeSyntaxViolation);
2221	if (iterator < `'0'` \|\| iterator > `'7'`)
2222	return makeUnexpected(IPv4PieceParsingError::Failure);
2223	value *= `8`;
2224	value += *iterator - `'0'`;
2225	if (UNLIKELY(value.hasOverflowed()))
2226	return makeUnexpected(IPv4PieceParsingError::Overflow);
2227	++iterator;
2228	break;
2229	case State::Hex:
2230	ASSERT(didSeeSyntaxViolation);
2231	if (!isASCIIHexDigit(*iterator))
2232	return makeUnexpected(IPv4PieceParsingError::Failure);
2233	value *= `16`;
2234	value += toASCIIHexValue(*iterator);
2235	if (UNLIKELY(value.hasOverflowed()))
2236	return makeUnexpected(IPv4PieceParsingError::Overflow);
2237	++iterator;
2238	break;
2239	}
2240	}
2241	ASSERT(!value.hasOverflowed());
2242	return value.unsafeGet();
2243	}
2244
2245	ALWAYS_INLINE static uint64_t pow256(size_t exponent)
2246	{
2247	RELEASE_ASSERT(exponent <= `4`);
2248	uint64_t values[`5`] = {`1`, `256`, `256` * `256`, `256` * `256` * `256`, `256ull` * `256` * `256` * `256` };
2249	return values[exponent];
2250	}
2251
// Reasons parseIPv4Host() can decline to return an address.
2252	enum class URLParser::IPv4ParsingError {
// Every piece parsed as a number but a value was out of range or overflowed;
// parseHostAndPort() rejects the host when it sees this.
2253	Failure,
// The text is not an IPv4 literal; parseHostAndPort() goes on to handle it as a host name.
2254	NotIPv4,
2255	};
2256
2257	template<typename CharacterTypeForSyntaxViolation, typename CharacterType>
2258	Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator)
2259	{
2260	Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, `4`> items;
2261	bool didSeeSyntaxViolation = false;
2262	if (!iterator.atEnd() && *iterator == `'.'`)
2263	return makeUnexpected(IPv4ParsingError::NotIPv4);
2264	while (!iterator.atEnd()) {
2265	if (isTabOrNewline(*iterator)) {
2266	didSeeSyntaxViolation = true;
2267	++iterator;
2268	continue;
2269	}
2270	if (items.size() >= `4`)
2271	return makeUnexpected(IPv4ParsingError::NotIPv4);
2272	items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation));
2273	if (!iterator.atEnd() && *iterator == `'.'`) {
2274	++iterator;
2275	if (iterator.atEnd())
2276	didSeeSyntaxViolation = true;
2277	else if (*iterator == `'.'`)
2278	return makeUnexpected(IPv4ParsingError::NotIPv4);
2279	}
2280	}
2281	if (!iterator.atEnd() \|\| !items.size() \|\| items.size() > `4`)
2282	return makeUnexpected(IPv4ParsingError::NotIPv4);
2283	for (const auto& item : items) {
2284	if (!item.has_value() && item.error() == IPv4PieceParsingError::Failure)
2285	return makeUnexpected(IPv4ParsingError::NotIPv4);
2286	}
2287	for (const auto& item : items) {
2288	if (!item.has_value() && item.error() == IPv4PieceParsingError::Overflow)
2289	return makeUnexpected(IPv4ParsingError::Failure);
2290	}
2291	if (items.size() > `1`) {
2292	for (size_t i = `0`; i < items.size() - `1`; i++) {
2293	if (items [i].value() > `255`)
2294	return makeUnexpected(IPv4ParsingError::Failure);
2295	}
2296	}
2297	if (items [items.size() - `1`].value() >= pow256(`5` - items.size()))
2298	return makeUnexpected(IPv4ParsingError::Failure);
2299
2300	if (didSeeSyntaxViolation)
2301	syntaxViolation(iteratorForSyntaxViolationPosition);
2302	for (const auto& item : items) {
2303	if (item.value() > `255`)
2304	syntaxViolation(iteratorForSyntaxViolationPosition);
2305	}
2306
2307	if (UNLIKELY(items.size() != `4`))
2308	syntaxViolation(iteratorForSyntaxViolationPosition);
2309
2310	IPv4Address ipv4 = items.takeLast().value();
2311	for (size_t counter = `0`; counter < items.size(); ++counter)
2312	ipv4 += items [counter].value() * pow256(`3` - counter);
2313	return ipv4;
2314	}
2315
2316	template<typename CharacterType>
2317	Optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator)
2318	{
2319	if (iterator.atEnd())
2320	return WTF::nullopt;
2321	uint32_t piece = `0`;
2322	bool leadingZeros = false;
2323	size_t digitCount = `0`;
2324	while (!iterator.atEnd()) {
2325	if (!isASCIIDigit(*iterator))
2326	return WTF::nullopt;
2327	++digitCount;
2328	if (!piece && *iterator == `'0'`) {
2329	if (leadingZeros)
2330	return WTF::nullopt;
2331	leadingZeros = true;
2332	}
2333	if (!piece && *iterator == `'0'`)
2334	leadingZeros = true;
2335	piece = piece * `10` + *iterator - `'0'`;
2336	if (piece > `255`)
2337	return WTF::nullopt;
2338	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2339	if (iterator.atEnd())
2340	break;
2341	if (*iterator == `'.'`)
2342	break;
2343	}
2344	if (piece && leadingZeros)
2345	return WTF::nullopt;
2346	return piece;
2347	}
2348
2349	template<typename CharacterType>
2350	Optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator)
2351	{
2352	IPv4Address address = `0`;
2353	for (size_t i = `0`; i < `4`; ++i) {
2354	if (Optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator))
2355	address = (address << `8`) + piece.value();
2356	else
2357	return WTF::nullopt;
2358	if (i < `3`) {
2359	if (iterator.atEnd())
2360	return WTF::nullopt;
2361	if (*iterator != `'.'`)
2362	return WTF::nullopt;
2363	advance<CharacterType, ReportSyntaxViolation::No>(iterator);
2364	} else if (!iterator.atEnd())
2365	return WTF::nullopt;
2366	}
2367	ASSERT(iterator.atEnd());
2368	return address;
2369	}
2370
// Parses a bracketed IPv6 literal. |c| must start on the '[' (asserted below); parseHostAndPort()
// passes a range that stops just before the matching ']'.
// Returns the eight 16-bit pieces on success and WTF::nullopt on malformed input. Spellings that
// parse but are not canonical are reported through syntaxViolation() at the '[' position.
2371	template<typename CharacterType>
2372	Optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c)
2373	{
2374	ASSERT(*c == `'['`);
2375	const auto hostBegin = c;
2376	advance(c, hostBegin);
2377	if (c.atEnd())
2378	return WTF::nullopt;
2379
2380	IPv6Address address = {{`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`}};
// Index of the next piece to be written into |address|.
2381	size_t piecePointer = `0`;
// Set when "::" is seen. Note that piecePointer is incremented before it is recorded here,
// so the stored value is one past the piece index at which the "::" was encountered.
2382	Optional<size_t> compressPointer;
2383	bool previousValueWasZero = false;
2384	bool immediatelyAfterCompress = false;
2385
// A literal may only start with ':' when that ':' begins a "::".
2386	if (*c == `':'`) {
2387	advance(c, hostBegin);
2388	if (c.atEnd())
2389	return WTF::nullopt;
2390	if (*c != `':'`)
2391	return WTF::nullopt;
2392	advance(c, hostBegin);
2393	++piecePointer;
2394	compressPointer = piecePointer;
2395	immediatelyAfterCompress = true;
2396	}
2397
2398	while (!c.atEnd()) {
2399	if (piecePointer == `8`)
2400	return WTF::nullopt;
// A ':' at the start of a piece is the second colon of a "::"; only one "::" is allowed.
2401	if (*c == `':'`) {
2402	if (compressPointer)
2403	return WTF::nullopt;
2404	advance(c, hostBegin);
2405	++piecePointer;
2406	compressPointer = piecePointer;
2407	immediatelyAfterCompress = true;
2408	if (previousValueWasZero)
2409	syntaxViolation(hostBegin);
2410	continue;
2411	}
// At these positions the rest of the input may be an embedded dotted-decimal IPv4 address,
// which fills two pieces. If it does not parse as one, fall through to hex parsing.
2412	if (piecePointer == `6` \|\| (compressPointer && piecePointer < `6`)) {
2413	if (Optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) {
2414	if (compressPointer && piecePointer == `5`)
2415	return WTF::nullopt;
2416	syntaxViolation(hostBegin);
2417	address [piecePointer++] = ipv4Address.value() >> `16`;
2418	address [piecePointer++] = ipv4Address.value() & `0xFFFF`;
// parseIPv4AddressInsideIPv6 took its iterator by value, so |c| was not advanced; reset it to a
// default-constructed iterator (presumably one that reports atEnd()) so the check after the loop passes.
2419	c = { };
2420	break;
2421	}
2422	}
// Read up to four hexadecimal digits for this piece.
2423	uint16_t value = `0`;
2424	size_t length = `0`;
2425	bool leadingZeros = false;
2426	for (; length < `4`; length++) {
2427	if (c.atEnd())
2428	break;
2429	if (!isASCIIHexDigit(*c))
2430	break;
2431	if (isASCIIUpper(*c))
2432	syntaxViolation(hostBegin);
2433	if (*c == `'0'` && !length)
2434	leadingZeros = true;
2435	value = value * `0x10` + toASCIIHexValue(*c);
2436	advance(c, hostBegin);
2437	}
2438
// Leading zeros on a non-zero piece, a zero piece written with more than one digit, and a zero
// piece right after "::" are all accepted but flagged as syntax violations.
2439	previousValueWasZero = !value;
2440	if (UNLIKELY((value && leadingZeros) \|\| (previousValueWasZero && (length > `1` \|\| immediatelyAfterCompress))))
2441	syntaxViolation(hostBegin);
2442
2443	address [piecePointer++] = value;
2444	if (c.atEnd())
2445	break;
// Anything other than a ':' separator (or a ninth piece) makes the literal invalid.
2446	if (piecePointer == `8` \|\| *c != `':'`)
2447	return WTF::nullopt;
2448	advance(c, hostBegin);
2449	if (c.atEnd())
2450	syntaxViolation(hostBegin);
2451
2452	immediatelyAfterCompress = false;
2453	}
2454
2455	if (!c.atEnd())
2456	return WTF::nullopt;
2457
// With a "::", move the pieces written after it to the end of the array, working backwards from
// index 7; without one, all eight pieces must have been given explicitly.
2458	if (compressPointer) {
2459	size_t swaps = piecePointer - compressPointer.value();
2460	piecePointer = `7`;
2461	while (swaps)
2462	std::swap(address [piecePointer--], address [compressPointer.value() + swaps-- - `1`]);
2463	} else if (piecePointer != `8`)
2464	return WTF::nullopt;
2465
// Compare where the input placed "::" with where findLongestZeroSequence() would place it
// (shifted by one to match how compressPointer was recorded); a mismatch is a syntax violation.
2466	Optional<size_t> possibleCompressPointer = findLongestZeroSequence(address);
2467	if (possibleCompressPointer)
2468	possibleCompressPointer.value()++;
2469	if (UNLIKELY(compressPointer != possibleCompressPointer))
2470	syntaxViolation(hostBegin);
2471
2472	return address;
2473	}
2474
2475	template<typename CharacterType>
2476	URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2477	{
2478	LCharBuffer output;
2479	output.reserveInitialCapacity(length);
2480
2481	for (size_t i = `0`; i < length; ++i) {
2482	uint8_t byte = input[i];
2483	if (byte != `'%'`)
2484	output.uncheckedAppend(byte);
2485	else if (length > `2` && i < length - `2`) {
2486	if (isASCIIHexDigit(input[i + `1`]) && isASCIIHexDigit(input[i + `2`])) {
2487	syntaxViolation(iteratorForSyntaxViolationPosition);
2488	output.uncheckedAppend(toASCIIHexValue(input[i + `1`], input[i + `2`]));
2489	i += `2`;
2490	} else
2491	output.uncheckedAppend(byte);
2492	} else
2493	output.uncheckedAppend(byte);
2494	}
2495	return output;
2496	}
2497
2498	URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length)
2499	{
2500	LCharBuffer output;
2501	output.reserveInitialCapacity(length);
2502
2503	for (size_t i = `0`; i < length; ++i) {
2504	uint8_t byte = input[i];
2505	if (byte != `'%'`)
2506	output.uncheckedAppend(byte);
2507	else if (length > `2` && i < length - `2`) {
2508	if (isASCIIHexDigit(input[i + `1`]) && isASCIIHexDigit(input[i + `2`])) {
2509	output.uncheckedAppend(toASCIIHexValue(input[i + `1`], input[i + `2`]));
2510	i += `2`;
2511	} else
2512	output.uncheckedAppend(byte);
2513	} else
2514	output.uncheckedAppend(byte);
2515	}
2516	return output;
2517	}
2518
2519	template<typename CharacterType> Optional<URLParser::LCharBuffer> URLParser::domainToASCII(StringImpl& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition)
2520	{
2521	LCharBuffer ascii;
2522	if (domain.isAllASCII()) {
2523	size_t length = domain.length();
2524	if (domain.is8Bit()) {
2525	const LChar* characters = domain.characters8();
2526	ascii.reserveInitialCapacity(length);
2527	for (size_t i = `0`; i < length; ++i) {
2528	if (UNLIKELY(isASCIIUpper(characters[i])))
2529	syntaxViolation(iteratorForSyntaxViolationPosition);
2530	ascii.uncheckedAppend(toASCIILower(characters[i]));
2531	}
2532	} else {
2533	const UChar* characters = domain.characters16();
2534	ascii.reserveInitialCapacity(length);
2535	for (size_t i = `0`; i < length; ++i) {
2536	if (UNLIKELY(isASCIIUpper(characters[i])))
2537	syntaxViolation(iteratorForSyntaxViolationPosition);
2538	ascii.uncheckedAppend(toASCIILower(characters[i]));
2539	}
2540	}
2541	return ascii;
2542	}
2543
2544	const size_t maxDomainLength = `64`;
2545	UChar hostnameBuffer[maxDomainLength];
2546	UErrorCode error = U_ZERO_ERROR;
2547	UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
2548	int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView (domain).upconvertedCharacters(), domain.length(), hostnameBuffer, maxDomainLength, &processingDetails, &error);
2549
2550	if (U_SUCCESS(error) && !processingDetails.errors) {
2551	#if ASSERT_DISABLED
2552	UNUSED_PARAM(numCharactersConverted);
2553	#else
2554	for (int32_t i = `0`; i < numCharactersConverted; ++i) {
2555	ASSERT(isASCII(hostnameBuffer[i]));
2556	ASSERT(!isASCIIUpper(hostnameBuffer[i]));
2557	}
2558	#endif
2559	ascii.append(hostnameBuffer, numCharactersConverted);
2560	if (domain != StringView (ascii.data(), ascii.size()))
2561	syntaxViolation(iteratorForSyntaxViolationPosition);
2562	return ascii;
2563	}
2564	return WTF::nullopt;
2565	}
2566
2567	bool URLParser::hasForbiddenHostCodePoint(const URLParser::LCharBuffer& asciiDomain)
2568	{
2569	for (size_t i = `0`; i < asciiDomain.size(); ++i) {
2570	if (isForbiddenHostCodePoint(asciiDomain [i]))
2571	return true;
2572	}
2573	return false;
2574	}
2575
2576	template<typename CharacterType>
2577	bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
2578	{
2579	ASSERT(*iterator == `':'`);
2580	auto colonIterator = iterator;
2581	advance(iterator, colonIterator);
2582	uint32_t port = `0`;
2583	if (UNLIKELY(iterator.atEnd())) {
2584	unsigned portLength = currentPosition(colonIterator) - m_url.m_hostEnd;
2585	RELEASE_ASSERT(portLength <= URL::maxPortLength);
2586	m_url.m_portLength = portLength;
2587	syntaxViolation(colonIterator);
2588	return true;
2589	}
2590	size_t digitCount = `0`;
2591	bool leadingZeros = false;
2592	for (; !iterator.atEnd(); ++iterator) {
2593	if (UNLIKELY(isTabOrNewline(*iterator))) {
2594	syntaxViolation(colonIterator);
2595	continue;
2596	}
2597	if (isASCIIDigit(*iterator)) {
2598	if (*iterator == `'0'` && !digitCount)
2599	leadingZeros = true;
2600	++digitCount;
2601	port = port * `10` + *iterator - `'0'`;
2602	if (port > std::numeric_limits<uint16_t>::max())
2603	return false;
2604	} else
2605	return false;
2606	}
2607
2608	if (port && leadingZeros)
2609	syntaxViolation(colonIterator);
2610
2611	if (!port && digitCount > `1`)
2612	syntaxViolation(colonIterator);
2613
2614	ASSERT(port == static_cast<uint16_t>(port));
2615	if (UNLIKELY(defaultPortForProtocol(parsedDataView(`0`, m_url.m_schemeEnd)) == static_cast<uint16_t>(port)))
2616	syntaxViolation(colonIterator);
2617	else {
2618	appendToASCIIBuffer(`':'`);
2619	ASSERT(port <= std::numeric_limits<uint16_t>::max());
2620	appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port));
2621	}
2622
2623	unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2624	RELEASE_ASSERT(portLength <= URL::maxPortLength);
2625	m_url.m_portLength = portLength;
2626	return true;
2627	}
2628
2629	template<typename CharacterType>
2630	bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
2631	{
2632	if (iterator.atEnd())
2633	return false;
2634	if (*iterator == `':'`)
2635	return false;
2636	if (*iterator == `'['`) {
2637	auto ipv6End = iterator;
2638	while (!ipv6End.atEnd() && *ipv6End != `']'`)
2639	++ipv6End;
2640	if (ipv6End.atEnd())
2641	return false;
2642	if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) {
2643	serializeIPv6(address.value());
2644	if (!ipv6End.atEnd()) {
2645	advance(ipv6End);
2646	m_url.m_hostEnd = currentPosition(ipv6End);
2647	if (!ipv6End.atEnd() && *ipv6End == `':'`)
2648	return parsePort(ipv6End);
2649	m_url.m_portLength = `0`;
2650	return ipv6End.atEnd();
2651	}
2652	m_url.m_hostEnd = currentPosition(ipv6End);
2653	return true;
2654	}
2655	return false;
2656	}
2657
2658	if (!m_urlIsSpecial) {
2659	for (; !iterator.atEnd(); ++iterator) {
2660	if (UNLIKELY(isTabOrNewline(*iterator))) {
2661	syntaxViolation(iterator);
2662	continue;
2663	}
2664	if (*iterator == `':'`)
2665	break;
2666	if (UNLIKELY(isForbiddenHostCodePoint(iterator) && iterator != `'%'`))
2667	return false;
2668	utf8PercentEncode<isInSimpleEncodeSet>(iterator);
2669	}
2670	m_url.m_hostEnd = currentPosition(iterator);
2671	if (iterator.atEnd()) {
2672	m_url.m_portLength = `0`;
2673	return true;
2674	}
2675	return parsePort(iterator);
2676	}
2677
2678	if (LIKELY(!m_hostHasPercentOrNonASCII)) {
2679	auto hostIterator = iterator;
2680	for (; !iterator.atEnd(); ++iterator) {
2681	if (isTabOrNewline(*iterator))
2682	continue;
2683	if (*iterator == `':'`)
2684	break;
2685	if (isForbiddenHostCodePoint(*iterator))
2686	return false;
2687	}
2688	auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator));
2689	if (address) {
2690	serializeIPv4(address.value());
2691	m_url.m_hostEnd = currentPosition(iterator);
2692	if (iterator.atEnd()) {
2693	m_url.m_portLength = `0`;
2694	return true;
2695	}
2696	return parsePort(iterator);
2697	}
2698	if (address.error() == IPv4ParsingError::Failure)
2699	return false;
2700	for (; hostIterator != iterator; ++hostIterator) {
2701	if (UNLIKELY(isTabOrNewline(*hostIterator))) {
2702	syntaxViolation(hostIterator);
2703	continue;
2704	}
2705	if (UNLIKELY(isASCIIUpper(*hostIterator)))
2706	syntaxViolation(hostIterator);
2707	appendToASCIIBuffer(toASCIILower(*hostIterator));
2708	}
2709	m_url.m_hostEnd = currentPosition(iterator);
2710	if (!hostIterator.atEnd())
2711	return parsePort(hostIterator);
2712	unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd;
2713	RELEASE_ASSERT(portLength <= URL::maxPortLength);
2714	m_url.m_portLength = portLength;
2715	return true;
2716	}
2717
2718	const auto hostBegin = iterator;
2719
2720	LCharBuffer utf8Encoded;
2721	for (; !iterator.atEnd(); ++iterator) {
2722	if (UNLIKELY(isTabOrNewline(*iterator))) {
2723	syntaxViolation(hostBegin);
2724	continue;
2725	}
2726	if (*iterator == `':'`)
2727	break;
2728	if (UNLIKELY(!isASCII(*iterator)))
2729	syntaxViolation(hostBegin);
2730
2731	if (!U_IS_UNICODE_CHAR(*iterator))
2732	return false;
2733	uint8_t buffer[U8_MAX_LENGTH];
2734	int32_t offset = `0`;
2735	U8_APPEND_UNSAFE(buffer, offset, *iterator);
2736	utf8Encoded.append(buffer, offset);
2737	}
2738	LCharBuffer percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin);
2739	String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2740	if (domain.isNull())
2741	return false;
2742	if (domain != StringView (percentDecoded.data(), percentDecoded.size()))
2743	syntaxViolation(hostBegin);
2744	auto asciiDomain = domainToASCII(*domain.impl(), hostBegin);
2745	if (!asciiDomain \|\| hasForbiddenHostCodePoint(asciiDomain.value()))
2746	return false;
2747	LCharBuffer& asciiDomainValue = asciiDomain.value();
2748	const LChar* asciiDomainCharacters = asciiDomainValue.data();
2749
2750	auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()));
2751	if (address) {
2752	serializeIPv4(address.value());
2753	m_url.m_hostEnd = currentPosition(iterator);
2754	if (iterator.atEnd()) {
2755	m_url.m_portLength = `0`;
2756	return true;
2757	}
2758	return parsePort(iterator);
2759	}
2760	if (address.error() == IPv4ParsingError::Failure)
2761	return false;
2762
2763	appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size());
2764	m_url.m_hostEnd = currentPosition(iterator);
2765	if (!iterator.atEnd())
2766	return parsePort(iterator);
2767	m_url.m_portLength = `0`;
2768	return true;
2769	}
2770
2771	Optional<String> URLParser::formURLDecode(StringView input)
2772	{
2773	auto utf8 = input.utf8(StrictConversion);
2774	if (utf8.isNull())
2775	return WTF::nullopt;
2776	auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
2777	return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
2778	}
2779
2780	// https://url.spec.whatwg.org/#concept-urlencoded-parser
2781	auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
2782	{
2783	URLEncodedForm output;
2784	for (StringView bytes : input.split(`'&'`)) {
2785	auto equalIndex = bytes.find(`'='`);
2786	if (equalIndex == notFound) {
2787	auto name = formURLDecode(bytes.toString().replace(`'+'`, `0x20`));
2788	if (name)
2789	output.append({ name.value(), emptyString() });
2790	} else {
2791	auto name = formURLDecode(bytes.substring(`0`, equalIndex).toString().replace(`'+'`, `0x20`));
2792	auto value = formURLDecode(bytes.substring(equalIndex + `1`).toString().replace(`'+'`, `0x20`));
2793	if (name && value)
2794	output.append({ name.value(), value.value() });
2795	}
2796	}
2797	return output;
2798	}
2799
2800	static void serializeURLEncodedForm(const String& input, Vector<LChar>& output)
2801	{
2802	auto utf8 = input.utf8(StrictConversion);
2803	const char* data = utf8.data();
2804	for (size_t i = `0`; i < utf8.length(); ++i) {
2805	const char byte = data[i];
2806	if (byte == `0x20`)
2807	output.append(`0x2B`);
2808	else if (byte == `0x2A`
2809	\|\| byte == `0x2D`
2810	\|\| byte == `0x2E`
2811	\|\| (byte >= `0x30` && byte <= `0x39`)
2812	\|\| (byte >= `0x41` && byte <= `0x5A`)
2813	\|\| byte == `0x5F`
2814	\|\| (byte >= `0x61` && byte <= `0x7A`)) // FIXME: Put these in the characterClassTable to avoid branches.
2815	output.append(byte);
2816	else
2817	percentEncodeByte(byte, output);
2818	}
2819	}
2820
2821	String URLParser::serialize(const URLEncodedForm& tuples)
2822	{
2823	if (tuples.isEmpty())
2824	return { };
2825
2826	Vector<LChar> output;
2827	for (auto& tuple : tuples) {
2828	if (!output.isEmpty())
2829	output.append(`'&'`);
2830	serializeURLEncodedForm(tuple.key, output);
2831	output.append(`'='`);
2832	serializeURLEncodedForm(tuple.value, output);
2833	}
2834	return String::adopt(WTFMove(output));
2835	}
2836
2837	const UIDNA& URLParser::internationalDomainNameTranscoder()
2838	{
2839	static UIDNA* encoder;
2840	static std::once_flag onceFlag;
2841	std::call_once(onceFlag, [] {
2842	UErrorCode error = U_ZERO_ERROR;
2843	encoder = uidna_openUTS46(UIDNA_CHECK_BIDI \| UIDNA_CHECK_CONTEXTJ \| UIDNA_NONTRANSITIONAL_TO_UNICODE \| UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
2844	RELEASE_ASSERT(U_SUCCESS(error));
2845	RELEASE_ASSERT(encoder);
2846	});
2847	return *encoder;
2848	}
2849
2850	bool URLParser::allValuesEqual(const URL& a, const URL& b)
2851	{
2852	URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
2853	a.m_isValid,
2854	a.m_cannotBeABaseURL,
2855	a.m_protocolIsInHTTPFamily,
2856	a.m_schemeEnd,
2857	a.m_userStart,
2858	a.m_userEnd,
2859	a.m_passwordEnd,
2860	a.m_hostEnd,
2861	a.m_hostEnd + a.m_portLength,
2862	a.m_pathAfterLastSlash,
2863	a.m_pathEnd,
2864	a.m_queryEnd,
2865	a.m_string.utf8().data(),
2866	b.m_isValid,
2867	b.m_cannotBeABaseURL,
2868	b.m_protocolIsInHTTPFamily,
2869	b.m_schemeEnd,
2870	b.m_userStart,
2871	b.m_userEnd,
2872	b.m_passwordEnd,
2873	b.m_hostEnd,
2874	b.m_hostEnd + b.m_portLength,
2875	b.m_pathAfterLastSlash,
2876	b.m_pathEnd,
2877	b.m_queryEnd,
2878	b.m_string.utf8().data());
2879
2880	return a.m_string == b.m_string
2881	&& a.m_isValid == b.m_isValid
2882	&& a.m_cannotBeABaseURL == b.m_cannotBeABaseURL
2883	&& a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily
2884	&& a.m_schemeEnd == b.m_schemeEnd
2885	&& a.m_userStart == b.m_userStart
2886	&& a.m_userEnd == b.m_userEnd
2887	&& a.m_passwordEnd == b.m_passwordEnd
2888	&& a.m_hostEnd == b.m_hostEnd
2889	&& a.m_portLength == b.m_portLength
2890	&& a.m_pathAfterLastSlash == b.m_pathAfterLastSlash
2891	&& a.m_pathEnd == b.m_pathEnd
2892	&& a.m_queryEnd == b.m_queryEnd;
2893	}
2894
2895	bool URLParser::internalValuesConsistent(const URL& url)
2896	{
2897	return url.m_schemeEnd <= url.m_userStart
2898	&& url.m_userStart <= url.m_userEnd
2899	&& url.m_userEnd <= url.m_passwordEnd
2900	&& url.m_passwordEnd <= url.m_hostEnd
2901	&& url.m_hostEnd + url.m_portLength <= url.m_pathAfterLastSlash
2902	&& url.m_pathAfterLastSlash <= url.m_pathEnd
2903	&& url.m_pathEnd <= url.m_queryEnd
2904	&& url.m_queryEnd <= url.m_string.length();
2905	}
2906
2907	} // namespace WTF
2908

Browse the source code of jsc/Source/WTF/wtf/URLParser.cpp