/* utf8.h source code [include/unicode/utf8.h] */

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 1999-2015, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: utf8.h
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 1999sep13
16	* created by: Markus W. Scherer
17	*/
18
19	/**
20	* \file
21	* \brief C API: 8-bit Unicode handling macros
22	*
23	* This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
24	*
25	* For more information see utf.h and the ICU User Guide Strings chapter
26	* (http://userguide.icu-project.org/strings).
27	*
28	* <em>Usage:</em>
29	* ICU coding guidelines for if() statements should be followed when using these macros.
30	* Compound statements (curly braces {}) must be used for if-else-while...
31	* bodies and all macro statements should be terminated with semicolon.
32	*/
33
34	#ifndef __UTF8_H__
35	#define __UTF8_H__
36
37	#include "unicode/umachine.h"
38	#ifndef __UTF_H__
39	# include "unicode/utf.h"
40	#endif
41
/* internal definitions ----------------------------------------------------- */
43
/**
 * Counts the trail bytes for a UTF-8 lead byte.
 * Returns 0 for 0..0xc1 as well as for 0xf5..0xff, because U8_IS_LEAD()
 * rejects those values; otherwise returns 1, 2 or 3.
 * leadByte might be evaluated multiple times.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @return the number of trail bytes (0..3)
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES(leadByte) \
    (U8_IS_LEAD(leadByte) ? \
        ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)
58
/**
 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
 * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
 * The result is the sum of three threshold comparisons (>=0xc2, >=0xe0, >=0xf0),
 * so no validity check of the lead byte is performed.
 * leadByte might be evaluated multiple times.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @return the number of trail bytes (0..3)
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
    (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
72
/**
 * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
 * The lead byte is modified in place: it keeps its low (6-countTrailBytes) bits.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
 * @param leadByte modifiable lvalue holding the lead byte; overwritten with the masked value
 * @param countTrailBytes number of trail bytes that follow the lead byte
 * @internal
 */
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
81
/**
 * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
 * Lead byte E0..EF bits 3..0 are used as byte index,
 * first trail byte bits 7..5 are used as bit index into that byte.
 *
 * Table contents:
 * - index 0 (lead E0) is 0x20: only bit 5 is set, i.e. first trail byte A0..BF;
 * - index 0xD (lead ED) is 0x10: only bit 4 is set, i.e. first trail byte 80..9F;
 * - all other entries are 0x30: bits 4 and 5 are set, i.e. first trail byte 80..BF.
 *
 * @see U8_IS_VALID_LEAD3_AND_T1
 * @internal
 */
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
91
/**
 * Internal 3-byte UTF-8 validity check.
 * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
 * Looks up U8_LEAD3_T1_BITS with the low 4 bits of the lead byte as byte index
 * and the top 3 bits of the trail byte as bit index.
 *
 * @param lead lead byte, expected in E0..EF (only its low 4 bits are used)
 * @param t1 first trail byte, any value 00..FF
 * @return non-zero if the pair is valid, otherwise 0
 * @internal
 */
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
98
/**
 * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
 * First trail byte bits 7..4 are used as byte index,
 * lead byte F0..F4 bits 2..0 are used as bit index into that byte.
 *
 * Table contents:
 * - index 8 (first trail byte 80..8F) is 0x1E: bits 1..4 are set, i.e. lead bytes F1..F4;
 * - indexes 9..0xB (first trail byte 90..BF) are 0x0F: bits 0..3 are set, i.e. lead bytes F0..F3;
 * - all other entries are 0: a first byte outside 80..BF is never a valid trail byte.
 *
 * @see U8_IS_VALID_LEAD4_AND_T1
 * @internal
 */
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
108
/**
 * Internal 4-byte UTF-8 validity check.
 * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
 * Looks up U8_LEAD4_T1_BITS with the top 4 bits of the trail byte as byte index
 * and the low 3 bits of the lead byte as bit index.
 *
 * @param lead lead byte, expected in F0..F4 (only its low 3 bits are used)
 * @param t1 first trail byte, any value 00..FF
 * @return non-zero if the pair is valid, otherwise 0
 * @internal
 */
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
115
116	/**
117	* Function for handling "next code point" with error-checking.
118	*
119	* This is internal since it is not meant to be called directly by external clients;
120	* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
121	* file and thus must remain stable, and should not be hidden when other internal
122	* functions are hidden (otherwise public macros would fail to compile).
123	* @internal
124	*/
125	U_STABLE UChar32 U_EXPORT2
126	utf8_nextCharSafeBody(const uint8_t s, int32_t pi, int32_t length, UChar32 c, UBool strict);
127
128	/**
129	* Function for handling "append code point" with error-checking.
130	*
131	* This is internal since it is not meant to be called directly by external clients;
132	* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
133	* file and thus must remain stable, and should not be hidden when other internal
134	* functions are hidden (otherwise public macros would fail to compile).
135	* @internal
136	*/
137	U_STABLE int32_t U_EXPORT2
138	utf8_appendCharSafeBody(uint8_t s, int32_t i, int32_t length, UChar32 c, UBool pIsError);
139
140	/**
141	* Function for handling "previous code point" with error-checking.
142	*
143	* This is internal since it is not meant to be called directly by external clients;
144	* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
145	* file and thus must remain stable, and should not be hidden when other internal
146	* functions are hidden (otherwise public macros would fail to compile).
147	* @internal
148	*/
149	U_STABLE UChar32 U_EXPORT2
150	utf8_prevCharSafeBody(const uint8_t s, int32_t start, int32_t pi, UChar32 c, UBool strict);
151
152	/**
153	* Function for handling "skip backward one code point" with error-checking.
154	*
155	* This is internal since it is not meant to be called directly by external clients;
156	* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
157	* file and thus must remain stable, and should not be hidden when other internal
158	* functions are hidden (otherwise public macros would fail to compile).
159	* @internal
160	*/
161	U_STABLE int32_t U_EXPORT2
162	utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
163
/* single-code point definitions -------------------------------------------- */
165
/**
 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
 * True exactly when bit 7 of c is clear.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
173
/**
 * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
 * Implemented as a single unsigned range check: the byte is shifted down by 0xc2
 * and truncated to 8 bits, so values below 0xc2 wrap around to large numbers.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
/* 0x32=0xf4-0xc2 */
182
/**
 * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
 * The byte is reinterpreted as a signed 8-bit value, where 0x80..0xBF map to
 * -128..-65, so one comparison against -0x40 covers the whole range.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
190
/**
 * How many code units (bytes) are used for the UTF-8 encoding
 * of this Unicode code point?
 * c is compared as uint32_t, so negative values are treated as out of range.
 * c might be evaluated multiple times.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define U8_LENGTH(c) \
    ((uint32_t)(c)<=0x7f ? 1 : \
        ((uint32_t)(c)<=0x7ff ? 2 : \
            ((uint32_t)(c)<=0xd7ff ? 3 : \
                ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
                    ((uint32_t)(c)<=0xffff ? 3 : 4)\
                ) \
            ) \
        ) \
    )
208
/**
 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
 * @return 4
 * @stable ICU 2.4
 */
#define U8_MAX_LENGTH 4
215
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 * The result is undefined if the offset points to an illegal UTF-8
 * byte sequence.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * The macro works on a private copy of i: the copy is moved back to the
 * start of the code point and then read forward.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_GET
 * @stable ICU 2.4
 */
#define U8_GET_UNSAFE(s, i, c) { \
    int32_t _u8_get_unsafe_index=(int32_t)(i); \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
}
237
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to a negative value.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * The macro works on a private copy of i, adjusted with U8_SET_CP_START()
 * and then read with U8_NEXT().
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U8_GET(s, start, i, length, c) { \
    int32_t _u8_get_index=(i); \
    U8_SET_CP_START(s, start, _u8_get_index); \
    U8_NEXT(s, _u8_get_index, length, c); \
}
264
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to U+FFFD.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_GET() if that distinction is important.
 *
 * The macro works on a private copy of i, adjusted with U8_SET_CP_START()
 * and then read with U8_NEXT_OR_FFFD().
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_GET
 * @stable ICU 51
 */
#define U8_GET_OR_FFFD(s, start, i, length, c) { \
    int32_t _u8_get_index=(i); \
    U8_SET_CP_START(s, start, _u8_get_index); \
    U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \
}
295
/* definitions with forward iteration --------------------------------------- */
297
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * The result is undefined if the offset points to a trail byte
 * or an illegal UTF-8 sequence.
 *
 * The sequence length is chosen from the lead byte alone:
 * below 0xe0 means 2 bytes, below 0xf0 means 3 bytes, otherwise 4 bytes.
 * No trail byte is validated and no string boundary is checked.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_NEXT
 * @stable ICU 2.4
 */
#define U8_NEXT_UNSAFE(s, i, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        if((c)<0xe0) { \
            (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
        } else if((c)<0xf0) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
            (i)+=2; \
        } else { \
            (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
            (i)+=3; \
        } \
    } \
}
330
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to a negative value.
 *
 * Forwards to U8_INTERNAL_NEXT_OR_SUB() with U_SENTINEL as the substitute value.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
352
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_NEXT() if that distinction is important.
 *
 * Forwards to U8_INTERNAL_NEXT_OR_SUB() with 0xfffd as the substitute value.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_NEXT
 * @stable ICU 51
 */
#define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
378
/**
 * Shared implementation of U8_NEXT() and U8_NEXT_OR_FFFD().
 * Reads the byte at offset i and post-increments i. For a non-ASCII byte the whole
 * check is one short-circuit condition that fetches, validates and assembles the
 * trail bytes; as soon as a check fails, c is set to sub and i stays behind the
 * bytes accepted so far.
 *
 * The string boundary is tested with (i)!=(length), so a negative length never
 * matches and a NUL terminator stops a sequence only because it is not a trail byte.
 * s, i, length and c are evaluated multiple times.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable
 * @param sub value stored into c for an ill-formed sequence
 * @internal
 */
#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        uint8_t __t = 0; \
        if((i)!=(length) && \
            /* fetch/validate/assemble all but last trail byte */ \
            ((c)>=0xe0 ? \
                ((c)<0xf0 ?  /* U+0800..U+FFFF except surrogates */ \
                    U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
                    (__t&=0x3f, 1) \
                :  /* U+10000..U+10FFFF */ \
                    ((c)-=0xf0)<=4 && \
                    U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
                    ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
                    (__t=(s)[i]-0x80)<=0x3f) && \
                /* valid second-to-last trail byte */ \
                ((c)=((c)<<6)|__t, ++(i)!=(length)) \
            :  /* U+0080..U+07FF */ \
                (c)>=0xc2 && ((c)&=0x1f, 1)) && \
            /* last trail byte */ \
            (__t=(s)[i]-0x80)<=0x3f && \
            ((c)=((c)<<6)|__t, ++(i), 1)) { \
        } else { \
            (c)=(sub);  /* ill-formed*/ \
        } \
    } \
}
407
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * The lead byte is chosen by the magnitude of c; the nested else branches then
 * fall through to the shared trail-byte stores, so each trail byte is written
 * by exactly one statement.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U8_APPEND
 * @stable ICU 2.4
 */
#define U8_APPEND_UNSAFE(s, i, c) { \
    uint32_t __uc=(c); \
    if(__uc<=0x7f) { \
        (s)[(i)++]=(uint8_t)__uc; \
    } else { \
        if(__uc<=0x7ff) { \
            (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
        } else { \
            if(__uc<=0xffff) { \
                (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
            } else { \
                (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
                (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
            } \
            (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        } \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } \
}
440
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a non-ASCII code point is written, checks for sufficient space in the string.
 * If the code point is not valid or trail bytes do not fit,
 * then isError is set to TRUE.
 *
 * An ASCII code point is stored without comparing i against capacity,
 * which is why the caller must guarantee i<capacity.
 * On error nothing is written and i is not changed.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i int32_t string offset, must be i<capacity
 * @param capacity int32_t size of the string buffer
 * @param c UChar32 code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U8_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U8_APPEND(s, i, capacity, c, isError) { \
    uint32_t __uc=(c); \
    if(__uc<=0x7f) { \
        (s)[(i)++]=(uint8_t)__uc; \
    } else if(__uc<=0x7ff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else { \
        (isError)=TRUE; \
    } \
}
478
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The step is 1 plus the trail byte count that
 * U8_COUNT_TRAIL_BYTES_UNSAFE() derives from the byte at offset i.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_FWD_1
 * @stable ICU 2.4
 */
#define U8_FWD_1_UNSAFE(s, i) { \
    (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
}
492
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The first byte is always skipped. Trail bytes are then skipped one at a time,
 * only while each one is valid for the lead byte and the offset has not reached
 * length; for an ill-formed or truncated sequence the offset stops behind the
 * bytes accepted so far.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @see U8_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_1(s, i, length) { \
    uint8_t __b=(s)[(i)++]; \
    if(U8_IS_LEAD(__b) && (i)!=(length)) { \
        uint8_t __t1=(s)[i]; \
        if((0xe0<=__b && __b<0xf0)) { \
            if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                ++(i); \
            } \
        } else if(__b<0xe0) { \
            if(U8_IS_TRAIL(__t1)) { \
                ++(i); \
            } \
        } else /* __b>=0xf0 */ { \
            if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                ++(i); \
            } \
        } \
    } \
}
528
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * n is evaluated once; nothing happens if it is zero or negative.
 * No string boundary is checked.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_FWD_N
 * @stable ICU 2.4
 */
#define U8_FWD_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U8_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
}
548
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * Iteration stops early when the offset reaches length or, for a negative
 * length, when the byte at the offset is NUL. n is evaluated once.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param n number of code points to skip
 * @see U8_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_N(s, i, length, n) { \
    int32_t __N=(n); \
    while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
        U8_FWD_1(s, i, length); \
        --__N; \
    } \
}
571
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset is decremented while the byte at the offset is a trail byte;
 * the start of the string is not checked.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 * @stable ICU 2.4
 */
#define U8_SET_CP_START_UNSAFE(s, i) { \
    while(U8_IS_TRAIL((s)[i])) { --(i); } \
}
588
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
 *
 * When s[i] is a trail byte, the new offset is whatever
 * utf8_back1SafeBody(s, start, i) returns.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i
 * @see U8_SET_CP_START_UNSAFE
 * @see U8_TRUNCATE_IF_INCOMPLETE
 * @stable ICU 2.4
 */
#define U8_SET_CP_START(s, start, i) { \
    if(U8_IS_TRAIL((s)[(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
}
611
/**
 * If the string ends with a UTF-8 byte sequence that is valid so far
 * but incomplete, then reduce the length of the string to end before
 * the lead byte of that incomplete sequence.
 * For example, if the string ends with E1 80, the length is reduced by 2.
 *
 * In all other cases (the string ends with a complete sequence, or it is not
 * possible for any further trail byte to extend the trailing sequence)
 * the length remains unchanged.
 *
 * Useful for processing text split across multiple buffers
 * (save the incomplete sequence for later)
 * and for optimizing iteration
 * (check for string length only once per character).
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_SET_CP_START(), this macro never reads s[length].
 *
 * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
 *
 * At most the last three bytes are inspected. The macro body is a braced block,
 * like the other statement macros in this file, so that it cannot capture a
 * following else; s is parenthesized so that pointer expressions can be passed.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param length int32_t string length (usually start<=length); modifiable lvalue
 * @see U8_SET_CP_START
 * @stable ICU 61
 */
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) { \
    if((length)>(start)) { \
        uint8_t __b1=(s)[(length)-1]; \
        if(U8_IS_SINGLE(__b1)) { \
            /* common ASCII character */ \
        } else if(U8_IS_LEAD(__b1)) { \
            --(length); \
        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
            uint8_t __b2=(s)[(length)-2]; \
            if(0xe0<=__b2 && __b2<=0xf4) { \
                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
                        U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
                    (length)-=2; \
                } \
            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
                uint8_t __b3=(s)[(length)-3]; \
                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
                    (length)-=3; \
                } \
            } \
        } \
    } \
}
660
/* definitions with backward iteration -------------------------------------- */
662
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * Trail bytes are accumulated from the last one backward, each shifted 6 bits
 * further left, until a byte >=0xc0 is found; that byte is masked with
 * U8_MASK_LEAD_BYTE() according to the number of trail bytes seen.
 * The start of the string is not checked.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_PREV
 * @stable ICU 2.4
 */
#define U8_PREV_UNSAFE(s, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t __b, __count=1, __shift=6; \
\
        /* c is a trail byte */ \
        (c)&=0x3f; \
        for(;;) { \
            __b=(s)[--(i)]; \
            if(__b>=0xc0) { \
                U8_MASK_LEAD_BYTE(__b, __count); \
                (c)|=(UChar32)__b<<__shift; \
                break; \
            } else { \
                (c)|=(UChar32)(__b&0x3f)<<__shift; \
                ++__count; \
                __shift+=6; \
            } \
        } \
    } \
}
703
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
 *
 * The byte before the offset is read first; if it is not ASCII, the result comes
 * from utf8_prevCharSafeBody() called with strict=-1, which may also move i.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -1); \
    } \
}
730
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_PREV() if that distinction is important.
 *
 * The byte before the offset is read first; if it is not ASCII, the result comes
 * from utf8_prevCharSafeBody() called with strict=-3, which may also move i.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_PREV
 * @stable ICU 51
 */
#define U8_PREV_OR_FFFD(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -3); \
    } \
}
761
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset is decremented at least once, and again for as long as the byte
 * it lands on is a trail byte. The start of the string is not checked.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_BACK_1
 * @stable ICU 2.4
 */
#define U8_BACK_1_UNSAFE(s, i) { \
    while(U8_IS_TRAIL((s)[--(i)])) {} \
}
776
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The offset is decremented once; if it then points to a trail byte,
 * the new offset is whatever utf8_back1SafeBody(s, start, i) returns.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @see U8_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_1(s, start, i) { \
    if(U8_IS_TRAIL((s)[--(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
}
794
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * n is evaluated once; nothing happens if it is zero or negative.
 * The start of the string is not checked.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_BACK_N
 * @stable ICU 2.4
 */
#define U8_BACK_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U8_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
}
815
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * Iteration stops early when the offset reaches start. n is evaluated once.
 *
 * @param s const uint8_t * string
 * @param start int32_t index of the start of the string
 * @param i int32_t string offset, must be start<i
 * @param n number of code points to skip
 * @see U8_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_N(s, start, i, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U8_BACK_1(s, start, i); \
        --__N; \
    } \
}
837
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * Implemented as one step back to the start of the preceding code point
 * followed by one step forward over it.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \
    U8_BACK_1_UNSAFE(s, i); \
    U8_FWD_1_UNSAFE(s, i); \
}
855
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * Nothing is done when i is at start, or when i has reached a non-negative
 * length; otherwise the macro steps back with U8_BACK_1() and forward again
 * with U8_FWD_1().
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i<=length
 * @param length int32_t string length
 * @see U8_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT(s, start, i, length) { \
    if((start)<(i) && ((i)<(length) || (length)<0)) { \
        U8_BACK_1(s, start, i); \
        U8_FWD_1(s, i, length); \
    } \
}
879
880	#endif
881

/* Browse the source code of include/unicode/utf8.h */