utf8.h source code [include/x86_64-linux-gnu/unicode/utf8.h]

1	/*
2	*******************************************************************************
3	*
4	* Copyright (C) 1999-2014, International Business Machines
5	* Corporation and others. All Rights Reserved.
6	*
7	*******************************************************************************
8	* file name: utf8.h
9	* encoding: US-ASCII
10	* tab size: 8 (not used)
11	* indentation:4
12	*
13	* created on: 1999sep13
14	* created by: Markus W. Scherer
15	*/
16
17	/**
18	* \file
19	* \brief C API: 8-bit Unicode handling macros
20	*
21	* This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
22	*
23	* For more information see utf.h and the ICU User Guide Strings chapter
24	* (http://userguide.icu-project.org/strings).
25	*
26	* <em>Usage:</em>
27	* ICU coding guidelines for if() statements should be followed when using these macros.
28	* Compound statements (curly braces {}) must be used for if-else-while...
29	* bodies and all macro statements should be terminated with semicolon.
30	*/
31
32	#ifndef __UTF8_H__
33	#define __UTF8_H__
34
35	#include "unicode/umachine.h"
36	#ifndef __UTF_H__
37	# include "unicode/utf.h"
38	#endif
39
/* internal definitions ----------------------------------------------------- */
41
42	/**
43	* \var utf8_countTrailBytes
44	* Internal array with numbers of trail bytes for any given byte used in
45	* lead byte position.
46	*
47	* This is internal since it is not meant to be called directly by external clients;
48	* however it is called by public macros in this file and thus must remain stable,
49	* and should not be hidden when other internal functions are hidden (otherwise
50	* public macros would fail to compile).
51	* @internal
52	*/
53	#ifdef U_UTF8_IMPL
54	U_EXPORT const uint8_t
55	#elif defined(U_STATIC_IMPLEMENTATION) \|\| defined(U_COMMON_IMPLEMENTATION)
56	U_CFUNC const uint8_t
57	#else
58	U_CFUNC U_IMPORT const uint8_t / U_IMPORT2? / /U_IMPORT/
59	#endif
60	utf8_countTrailBytes[`256`];
61
/**
 * Counts the trail bytes for a UTF-8 lead byte.
 * Yields 0 for 0..0xbf as well as for 0xfe and 0xff,
 * 1 for 0xc0..0xdf, 2 for 0xe0..0xef, 3 for 0xf0..0xf7,
 * 4 for 0xf8..0xfb and 5 for 0xfc..0xfd.
 *
 * Internal, but used by public macros in this file and thus must remain stable.
 *
 * Note: Beginning with ICU 50, the implementation is a multi-condition expression
 * which was shown in 2012 (on x86-64) to compile to fast, branch-free code.
 * leadByte is evaluated multiple times.
 *
 * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:
 * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])
 * leadByte was evaluated exactly once.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES(leadByte) \
    ((uint8_t)(leadByte)<0xf0 ? \
        ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \
        ((uint8_t)(leadByte)<0xfe ? \
            3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0))
84
/**
 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
 * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.
 * leadByte is evaluated multiple times.
 *
 * The argument is narrowed to uint8_t before each comparison so that a lead
 * byte read through a signed char (negative for values >= 0x80) is still
 * classified correctly.
 *
 * Internal, but used by public macros in this file and thus must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
    (((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
98
/**
 * Mask a UTF-8 lead byte in place, leaving only the lower bits that form part
 * of the code point value: the low (6-countTrailBytes) bits are kept.
 * leadByte must be a modifiable lvalue; the expression yields the masked value.
 *
 * Internal, but used by public macros in this file and thus must remain stable.
 *
 * @param leadByte        lead byte variable, modified in place
 * @param countTrailBytes number of trail bytes that follow this lead byte
 * @internal
 */
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) \
    ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
107
108	/**
109	* Function for handling "next code point" with error-checking.
110	*
111	* This is internal since it is not meant to be called directly by external clients;
112	* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
113	* file and thus must remain stable, and should not be hidden when other internal
114	* functions are hidden (otherwise public macros would fail to compile).
115	* @internal
116	*/
117	U_STABLE UChar32 U_EXPORT2
118	utf8_nextCharSafeBody(const uint8_t s, int32_t pi, int32_t length, UChar32 c, UBool strict);
119
120	/**
121	* Function for handling "append code point" with error-checking.
122	*
123	* This is internal since it is not meant to be called directly by external clients;
124	* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
125	* file and thus must remain stable, and should not be hidden when other internal
126	* functions are hidden (otherwise public macros would fail to compile).
127	* @internal
128	*/
129	U_STABLE int32_t U_EXPORT2
130	utf8_appendCharSafeBody(uint8_t s, int32_t i, int32_t length, UChar32 c, UBool pIsError);
131
132	/**
133	* Function for handling "previous code point" with error-checking.
134	*
135	* This is internal since it is not meant to be called directly by external clients;
136	* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
137	* file and thus must remain stable, and should not be hidden when other internal
138	* functions are hidden (otherwise public macros would fail to compile).
139	* @internal
140	*/
141	U_STABLE UChar32 U_EXPORT2
142	utf8_prevCharSafeBody(const uint8_t s, int32_t start, int32_t pi, UChar32 c, UBool strict);
143
144	/**
145	* Function for handling "skip backward one code point" with error-checking.
146	*
147	* This is internal since it is not meant to be called directly by external clients;
148	* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
149	* file and thus must remain stable, and should not be hidden when other internal
150	* functions are hidden (otherwise public macros would fail to compile).
151	* @internal
152	*/
153	U_STABLE int32_t U_EXPORT2
154	utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
155
/* single-code point definitions -------------------------------------------- */
157
/**
 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
 * True exactly when bit 7 of the byte is clear.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
165
/**
 * Is this code unit (byte) a UTF-8 lead byte?
 * True for 0xc0..0xfd: subtracting 0xc0 and narrowing to uint8_t turns the
 * range test into a single unsigned comparison.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
173
/**
 * Is this code unit (byte) a UTF-8 trail byte?
 * True for 0x80..0xbf, i.e. when the top two bits are 10.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
181
/**
 * How many code units (bytes) are used for the UTF-8 encoding
 * of this Unicode code point?
 * c is evaluated multiple times. It is compared as uint32_t, so negative
 * values are treated as out of range.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define U8_LENGTH(c) \
    ((uint32_t)(c)<=0x7f ? 1 : \
     (uint32_t)(c)<=0x7ff ? 2 : \
     (uint32_t)(c)<=0xd7ff ? 3 : \
     ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff) ? 0 : \
     (uint32_t)(c)<=0xffff ? 3 : 4)
199
/**
 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
 * @return 4
 * @stable ICU 2.4
 */
#define U8_MAX_LENGTH 4
206
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 * The result is undefined if the offset points to an illegal UTF-8
 * byte sequence.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * Works on a private copy of i: the copy is first moved back to the start of
 * the code point, then the code point is decoded from there.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_GET
 * @stable ICU 2.4
 */
#define U8_GET_UNSAFE(s, i, c) { \
    int32_t _u8_get_unsafe_index=(int32_t)(i); \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
}
228
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to a negative value.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * Works on a private int32_t copy of i (explicitly cast, as in U8_GET_UNSAFE).
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U8_GET(s, start, i, length, c) { \
    int32_t _u8_get_index=(int32_t)(i); \
    U8_SET_CP_START(s, start, _u8_get_index); \
    U8_NEXT(s, _u8_get_index, length, c); \
}
255
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to U+FFFD.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_GET() if that distinction is important.
 *
 * Works on a private int32_t copy of i (explicitly cast, as in U8_GET_UNSAFE).
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_GET
 * @stable ICU 51
 */
#define U8_GET_OR_FFFD(s, start, i, length, c) { \
    int32_t _u8_get_index=(int32_t)(i); \
    U8_SET_CP_START(s, start, _u8_get_index); \
    U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \
}
286
/* definitions with forward iteration --------------------------------------- */
288
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * The result is undefined if the offset points to a trail byte
 * or an illegal UTF-8 sequence.
 *
 * Lead bytes below 0xe0 are decoded as 2-byte sequences, below 0xf0 as
 * 3-byte sequences, and everything else as 4-byte sequences.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_NEXT
 * @stable ICU 2.4
 */
#define U8_NEXT_UNSAFE(s, i, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if((c)<0xe0) { \
            (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
        } else if((c)<0xf0) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
            (i)+=2; \
        } else { \
            (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
            (i)+=3; \
        } \
    } \
}
321
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to a negative value.
 *
 * ASCII, lead bytes 0xe1..0xec with two valid trail bytes, and lead bytes
 * 0xc2..0xdf with one valid trail byte are decoded inline; everything else
 * is delegated to utf8_nextCharSafeBody().
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        uint8_t __t1, __t2; \
        if( /* handle U+1000..U+CFFF inline */ \
            (0xe0<(c) && (c)<=0xec) && \
            (((i)+1)<(length) || (length)<0) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
            (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
        ) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
            (i)+=2; \
        } else if( /* handle U+0080..U+07FF inline */ \
            ((c)<0xe0 && (c)>=0xc2) && \
            ((i)!=(length)) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
        ) { \
            (c)=(((c)&0x1f)<<6)|__t1; \
            ++(i); \
        } else { \
            /* function call for "complicated" and error cases */ \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (length), (c), -1); \
        } \
    } \
}
368
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_NEXT() if that distinction is important.
 *
 * The inline fast paths are the same as in U8_NEXT; only the mode value
 * passed to utf8_nextCharSafeBody() differs (-3 instead of -1).
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_NEXT
 * @stable ICU 51
 */
#define U8_NEXT_OR_FFFD(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        uint8_t __t1, __t2; \
        if( /* handle U+1000..U+CFFF inline */ \
            (0xe0<(c) && (c)<=0xec) && \
            (((i)+1)<(length) || (length)<0) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
            (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
        ) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
            (i)+=2; \
        } else if( /* handle U+0080..U+07FF inline */ \
            ((c)<0xe0 && (c)>=0xc2) && \
            ((i)!=(length)) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
        ) { \
            (c)=(((c)&0x1f)<<6)|__t1; \
            ++(i); \
        } else { \
            /* function call for "complicated" and error cases */ \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (length), (c), -3); \
        } \
    } \
}
419
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 * c is evaluated multiple times.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U8_APPEND
 * @stable ICU 2.4
 */
#define U8_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0x7f) { \
        (s)[(i)++]=(uint8_t)(c); \
    } else if((uint32_t)(c)<=0x7ff) { \
        (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else { \
        (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } \
}
451
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a non-ASCII code point is written, checks for sufficient space in the string.
 * If the code point is not valid or trail bytes do not fit,
 * then isError is set to TRUE.
 *
 * ASCII is stored without a capacity check (the caller must ensure i<capacity).
 * Code points up to U+07FF and up to U+D7FF are encoded inline when they fit;
 * everything else is delegated to utf8_appendCharSafeBody().
 *
 * @param s uint8_t * string buffer (written to)
 * @param i int32_t string offset, must be i<capacity
 * @param capacity int32_t size of the string buffer
 * @param c UChar32 code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U8_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U8_APPEND(s, i, capacity, c, isError) { \
    if((uint32_t)(c)<=0x7f) { \
        (s)[(i)++]=(uint8_t)(c); \
    } else if((uint32_t)(c)<=0x7ff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else if((uint32_t)(c)<=0xd7ff && (i)+2<(capacity)) { \
        (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else { \
        (i)=utf8_appendCharSafeBody((s), (i), (capacity), (c), &(isError)); \
    } \
}
483
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The step is 1 plus the trail byte count derived from the byte at i.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_FWD_1
 * @stable ICU 2.4
 */
#define U8_FWD_1_UNSAFE(s, i) { \
    (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \
}
497
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * Always consumes the byte at i. If that byte is a lead byte, it then skips
 * at most as many following bytes as the lead byte announces, stopping early
 * at the first non-trail byte or at the string limit.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @see U8_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_1(s, i, length) { \
    uint8_t __b=(uint8_t)(s)[(i)++]; \
    if(U8_IS_LEAD(__b)) { \
        uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
        /* with a known length, never look past the end of the string */ \
        if((i)+__count>(length) && (length)>=0) { \
            __count=(uint8_t)((length)-(i)); \
        } \
        while(__count>0 && U8_IS_TRAIL((s)[i])) { \
            ++(i); \
            --__count; \
        } \
    } \
}
524
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * n is evaluated once; nothing happens if n<=0.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_FWD_N
 * @stable ICU 2.4
 */
#define U8_FWD_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U8_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
}
544
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * Stops early when the offset reaches length or, for a negative length,
 * when the byte at the offset is NUL. n is evaluated once.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param n number of code points to skip
 * @see U8_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_N(s, i, length, n) { \
    int32_t __N=(n); \
    while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
        U8_FWD_1(s, i, length); \
        --__N; \
    } \
}
567
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * Steps back while the byte at i is a trail byte; there is no lower bound check.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 * @stable ICU 2.4
 */
#define U8_SET_CP_START_UNSAFE(s, i) { \
    while(U8_IS_TRAIL((s)[i])) { --(i); } \
}
584
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The backward search is delegated to utf8_back1SafeBody(), whose return
 * value becomes the new offset.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i
 * @see U8_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U8_SET_CP_START(s, start, i) { \
    if(U8_IS_TRAIL((s)[(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
}
604
/* definitions with backward iteration -------------------------------------- */
606
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * Trail bytes are accumulated 6 bits at a time while walking backward until
 * a byte >=0xc0 is found; that byte is masked according to the number of
 * trail bytes seen and supplies the top bits.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_PREV
 * @stable ICU 2.4
 */
#define U8_PREV_UNSAFE(s, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t __b, __count=1, __shift=6; \
\
        /* c is a trail byte */ \
        (c)&=0x3f; \
        for(;;) { \
            __b=(uint8_t)(s)[--(i)]; \
            if(__b>=0xc0) { \
                U8_MASK_LEAD_BYTE(__b, __count); \
                (c)|=(UChar32)__b<<__shift; \
                break; \
            } else { \
                (c)|=(UChar32)(__b&0x3f)<<__shift; \
                ++__count; \
                __shift+=6; \
            } \
        } \
    } \
}
647
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
 *
 * An ASCII byte is returned directly; any byte >=0x80 is handed to
 * utf8_prevCharSafeBody() together with the already decremented offset.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), (c), -1); \
    } \
}
674
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_PREV() if that distinction is important.
 *
 * Identical to U8_PREV except for the mode value passed to
 * utf8_prevCharSafeBody() (-3 instead of -1).
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_PREV
 * @stable ICU 51
 */
#define U8_PREV_OR_FFFD(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), (c), -3); \
    } \
}
705
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * Decrements i at least once and keeps decrementing while the byte at the
 * new offset is a trail byte; there is no lower bound check.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_BACK_1
 * @stable ICU 2.4
 */
#define U8_BACK_1_UNSAFE(s, i) { \
    while(U8_IS_TRAIL((s)[--(i)])) {} \
}
720
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * Decrements i once; if the byte there is a trail byte, the remaining
 * backward search is delegated to utf8_back1SafeBody().
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @see U8_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_1(s, start, i) { \
    if(U8_IS_TRAIL((s)[--(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
}
738
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * n is evaluated once; nothing happens if n<=0.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_BACK_N
 * @stable ICU 2.4
 */
#define U8_BACK_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U8_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
}
759
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * Stops early once the offset is no longer greater than start.
 * n is evaluated once.
 *
 * @param s const uint8_t * string
 * @param start int32_t index of the start of the string
 * @param i int32_t string offset, must be start<i
 * @param n number of code points to skip
 * @see U8_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_N(s, start, i, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U8_BACK_1(s, start, i); \
        --__N; \
    } \
}
781
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * Implemented as one step back to the preceding code point start followed
 * by one step forward over that whole code point.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \
    U8_BACK_1_UNSAFE(s, i); \
    U8_FWD_1_UNSAFE(s, i); \
}
799
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * Nothing is done when i is at start, when i has reached length, or (for a
 * negative length) when the byte at i is NUL. Otherwise the offset is moved
 * one code point back and then one code point forward.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i<=length
 * @param length int32_t string length
 * @see U8_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT(s, start, i, length) { \
    if((start)<(i) && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
        U8_BACK_1(s, start, i); \
        U8_FWD_1(s, i, length); \
    } \
}
823
824	#endif
825

Browse the source code of include/x86_64-linux-gnu/unicode/utf8.h