/*
*******************************************************************************
*
*   Copyright (C) 2002-2011 International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uiter.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jan18
*   created by: Markus W. Scherer
*/

17#ifndef __UITER_H__
18#define __UITER_H__
19
20/**
21 * \file
22 * \brief C API: Unicode Character Iteration
23 *
24 * @see UCharIterator
25 */
26
27#include "unicode/utypes.h"
28
/*
 * Forward declarations of the C++ classes that appear as parameter types in
 * the C++-only setup functions declared in this header.
 */
#if U_SHOW_CPLUSPLUS_API
    U_NAMESPACE_BEGIN

    class CharacterIterator;
    class Replaceable;

    U_NAMESPACE_END
#endif

38U_CDECL_BEGIN
39
40struct UCharIterator;
41typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
42
/**
 * Origin constants for UCharIterator.getIndex() and UCharIterator.move().
 *
 * The enumerators rely on the implicit numbering (0..4); their order must
 * not be changed.
 *
 * @see UCharIteratorMove
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef enum UCharIteratorOrigin {
    /** The start of the iteration range. */
    UITER_START,
    /** The current position. */
    UITER_CURRENT,
    /** The limit of the iteration range. */
    UITER_LIMIT,
    /** Index 0; move(iter, index, UITER_ZERO) sets an absolute index. */
    UITER_ZERO,
    /**
     * The length. May be slow to determine because an iterator
     * implementation may have to count UChars if the underlying storage
     * is not UTF-16.
     */
    UITER_LENGTH
} UCharIteratorOrigin;

/** Constants for UCharIterator. @stable ICU 2.6 */
enum {
    /**
     * Constant value that may be returned by UCharIteratorMove
     * indicating that the final UTF-16 index is not known, but that the move succeeded.
     * This can occur when moving relative to limit or length, or
     * when moving relative to the current index after a setState()
     * when the current UTF-16 index is not known.
     *
     * It would be very inefficient to have to count from the beginning of the text
     * just to get the current/limit/length index after moving relative to it.
     * The actual index can be determined with getIndex(UITER_CURRENT)
     * which will count the UChars if necessary.
     *
     * @stable ICU 2.6
     */
    UITER_UNKNOWN_INDEX=-2
};


/**
 * Constant for UCharIterator getState() indicating an error or
 * an unknown state.
 * Returned by uiter_getState()/UCharIteratorGetState
 * when an error occurs.
 * Also, some UCharIterator implementations may not be able to return
 * a valid state for each position. This will be clearly documented
 * for each such iterator (none of the public ones here).
 *
 * @stable ICU 2.6
 */
#define UITER_NO_STATE ((uint32_t)0xffffffff)

86/**
87 * Function type declaration for UCharIterator.getIndex().
88 *
89 * Gets the current position, or the start or limit of the
90 * iteration range.
91 *
92 * This function may perform slowly for UITER_CURRENT after setState() was called,
93 * or for UITER_LENGTH, because an iterator implementation may have to count
94 * UChars if the underlying storage is not UTF-16.
95 *
96 * @param iter the UCharIterator structure ("this pointer")
97 * @param origin get the 0, start, limit, length, or current index
98 * @return the requested index, or U_SENTINEL in an error condition
99 *
100 * @see UCharIteratorOrigin
101 * @see UCharIterator
102 * @stable ICU 2.1
103 */
104typedef int32_t U_CALLCONV
105UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin);
106
107/**
108 * Function type declaration for UCharIterator.move().
109 *
110 * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
111 *
112 * Moves the current position relative to the start or limit of the
113 * iteration range, or relative to the current position itself.
114 * The movement is expressed in numbers of code units forward
115 * or backward by specifying a positive or negative delta.
116 * Out of bounds movement will be pinned to the start or limit.
117 *
118 * This function may perform slowly for moving relative to UITER_LENGTH
119 * because an iterator implementation may have to count the rest of the
120 * UChars if the native storage is not UTF-16.
121 *
122 * When moving relative to the limit or length, or
123 * relative to the current position after setState() was called,
124 * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
125 * determination of the actual UTF-16 index.
126 * The actual index can be determined with getIndex(UITER_CURRENT)
127 * which will count the UChars if necessary.
128 * See UITER_UNKNOWN_INDEX for details.
129 *
130 * @param iter the UCharIterator structure ("this pointer")
131 * @param delta can be positive, zero, or negative
132 * @param origin move relative to the 0, start, limit, length, or current index
133 * @return the new index, or U_SENTINEL on an error condition,
134 * or UITER_UNKNOWN_INDEX when the index is not known.
135 *
136 * @see UCharIteratorOrigin
137 * @see UCharIterator
138 * @see UITER_UNKNOWN_INDEX
139 * @stable ICU 2.1
140 */
141typedef int32_t U_CALLCONV
142UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin);
143
144/**
145 * Function type declaration for UCharIterator.hasNext().
146 *
147 * Check if current() and next() can still
148 * return another code unit.
149 *
150 * @param iter the UCharIterator structure ("this pointer")
151 * @return boolean value for whether current() and next() can still return another code unit
152 *
153 * @see UCharIterator
154 * @stable ICU 2.1
155 */
156typedef UBool U_CALLCONV
157UCharIteratorHasNext(UCharIterator *iter);
158
159/**
160 * Function type declaration for UCharIterator.hasPrevious().
161 *
162 * Check if previous() can still return another code unit.
163 *
164 * @param iter the UCharIterator structure ("this pointer")
165 * @return boolean value for whether previous() can still return another code unit
166 *
167 * @see UCharIterator
168 * @stable ICU 2.1
169 */
170typedef UBool U_CALLCONV
171UCharIteratorHasPrevious(UCharIterator *iter);
172
173/**
174 * Function type declaration for UCharIterator.current().
175 *
176 * Return the code unit at the current position,
177 * or U_SENTINEL if there is none (index is at the limit).
178 *
179 * @param iter the UCharIterator structure ("this pointer")
180 * @return the current code unit
181 *
182 * @see UCharIterator
183 * @stable ICU 2.1
184 */
185typedef UChar32 U_CALLCONV
186UCharIteratorCurrent(UCharIterator *iter);
187
188/**
189 * Function type declaration for UCharIterator.next().
190 *
191 * Return the code unit at the current index and increment
192 * the index (post-increment, like s[i++]),
193 * or return U_SENTINEL if there is none (index is at the limit).
194 *
195 * @param iter the UCharIterator structure ("this pointer")
196 * @return the current code unit (and post-increment the current index)
197 *
198 * @see UCharIterator
199 * @stable ICU 2.1
200 */
201typedef UChar32 U_CALLCONV
202UCharIteratorNext(UCharIterator *iter);
203
204/**
205 * Function type declaration for UCharIterator.previous().
206 *
207 * Decrement the index and return the code unit from there
208 * (pre-decrement, like s[--i]),
209 * or return U_SENTINEL if there is none (index is at the start).
210 *
211 * @param iter the UCharIterator structure ("this pointer")
212 * @return the previous code unit (after pre-decrementing the current index)
213 *
214 * @see UCharIterator
215 * @stable ICU 2.1
216 */
217typedef UChar32 U_CALLCONV
218UCharIteratorPrevious(UCharIterator *iter);
219
220/**
221 * Function type declaration for UCharIterator.reservedFn().
222 * Reserved for future use.
223 *
224 * @param iter the UCharIterator structure ("this pointer")
225 * @param something some integer argument
226 * @return some integer
227 *
228 * @see UCharIterator
229 * @stable ICU 2.1
230 */
231typedef int32_t U_CALLCONV
232UCharIteratorReserved(UCharIterator *iter, int32_t something);
233
234/**
235 * Function type declaration for UCharIterator.getState().
236 *
237 * Get the "state" of the iterator in the form of a single 32-bit word.
238 * It is recommended that the state value be calculated to be as small as
239 * is feasible. For strings with limited lengths, fewer than 32 bits may
240 * be sufficient.
241 *
242 * This is used together with setState()/UCharIteratorSetState
243 * to save and restore the iterator position more efficiently than with
244 * getIndex()/move().
245 *
246 * The iterator state is defined as a uint32_t value because it is designed
247 * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state
248 * of the character iterator.
249 *
250 * With some UCharIterator implementations (e.g., UTF-8),
251 * getting and setting the UTF-16 index with existing functions
252 * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
253 * relatively slow because the iterator has to "walk" from a known index
254 * to the requested one.
255 * This takes more time the farther it needs to go.
256 *
257 * An opaque state value allows an iterator implementation to provide
258 * an internal index (UTF-8: the source byte array index) for
259 * fast, constant-time restoration.
260 *
261 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
262 * the UTF-16 index may not be restored as well, but the iterator can deliver
263 * the correct text contents and move relative to the current position
264 * without performance degradation.
265 *
266 * Some UCharIterator implementations may not be able to return
267 * a valid state for each position, in which case they return UITER_NO_STATE instead.
268 * This will be clearly documented for each such iterator (none of the public ones here).
269 *
270 * @param iter the UCharIterator structure ("this pointer")
271 * @return the state word
272 *
273 * @see UCharIterator
274 * @see UCharIteratorSetState
275 * @see UITER_NO_STATE
276 * @stable ICU 2.6
277 */
278typedef uint32_t U_CALLCONV
279UCharIteratorGetState(const UCharIterator *iter);
280
281/**
282 * Function type declaration for UCharIterator.setState().
283 *
284 * Restore the "state" of the iterator using a state word from a getState() call.
285 * The iterator object need not be the same one as for which getState() was called,
286 * but it must be of the same type (set up using the same uiter_setXYZ function)
287 * and it must iterate over the same string
288 * (binary identical regardless of memory address).
289 * For more about the state word see UCharIteratorGetState.
290 *
291 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
292 * the UTF-16 index may not be restored as well, but the iterator can deliver
293 * the correct text contents and move relative to the current position
294 * without performance degradation.
295 *
296 * @param iter the UCharIterator structure ("this pointer")
297 * @param state the state word from a getState() call
298 * on a same-type, same-string iterator
299 * @param pErrorCode Must be a valid pointer to an error code value,
300 * which must not indicate a failure before the function call.
301 *
302 * @see UCharIterator
303 * @see UCharIteratorGetState
304 * @stable ICU 2.6
305 */
306typedef void U_CALLCONV
307UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
308
309
310/**
311 * C API for code unit iteration.
312 * This can be used as a C wrapper around
313 * CharacterIterator, Replaceable, or implemented using simple strings, etc.
314 *
315 * There are two roles for using UCharIterator:
316 *
317 * A "provider" sets the necessary function pointers and controls the "protected"
318 * fields of the UCharIterator structure. A "provider" passes a UCharIterator
319 * into C APIs that need a UCharIterator as an abstract, flexible string interface.
320 *
321 * Implementations of such C APIs are "callers" of UCharIterator functions;
322 * they only use the "public" function pointers and never access the "protected"
323 * fields directly.
324 *
325 * The current() and next() functions only check the current index against the
326 * limit, and previous() only checks the current index against the start,
327 * to see if the iterator already reached the end of the iteration range.
328 *
329 * The assumption - in all iterators - is that the index is moved via the API,
330 * which means it won't go out of bounds, or the index is modified by
331 * user code that knows enough about the iterator implementation to set valid
332 * index values.
333 *
334 * UCharIterator functions return code unit values 0..0xffff,
335 * or U_SENTINEL if the iteration bounds are reached.
336 *
337 * @stable ICU 2.1
338 */
339struct UCharIterator {
340 /**
341 * (protected) Pointer to string or wrapped object or similar.
342 * Not used by caller.
343 * @stable ICU 2.1
344 */
345 const void *context;
346
347 /**
348 * (protected) Length of string or similar.
349 * Not used by caller.
350 * @stable ICU 2.1
351 */
352 int32_t length;
353
354 /**
355 * (protected) Start index or similar.
356 * Not used by caller.
357 * @stable ICU 2.1
358 */
359 int32_t start;
360
361 /**
362 * (protected) Current index or similar.
363 * Not used by caller.
364 * @stable ICU 2.1
365 */
366 int32_t index;
367
368 /**
369 * (protected) Limit index or similar.
370 * Not used by caller.
371 * @stable ICU 2.1
372 */
373 int32_t limit;
374
375 /**
376 * (protected) Used by UTF-8 iterators and possibly others.
377 * @stable ICU 2.1
378 */
379 int32_t reservedField;
380
381 /**
382 * (public) Returns the current position or the
383 * start or limit index of the iteration range.
384 *
385 * @see UCharIteratorGetIndex
386 * @stable ICU 2.1
387 */
388 UCharIteratorGetIndex *getIndex;
389
390 /**
391 * (public) Moves the current position relative to the start or limit of the
392 * iteration range, or relative to the current position itself.
393 * The movement is expressed in numbers of code units forward
394 * or backward by specifying a positive or negative delta.
395 *
396 * @see UCharIteratorMove
397 * @stable ICU 2.1
398 */
399 UCharIteratorMove *move;
400
401 /**
402 * (public) Check if current() and next() can still
403 * return another code unit.
404 *
405 * @see UCharIteratorHasNext
406 * @stable ICU 2.1
407 */
408 UCharIteratorHasNext *hasNext;
409
410 /**
411 * (public) Check if previous() can still return another code unit.
412 *
413 * @see UCharIteratorHasPrevious
414 * @stable ICU 2.1
415 */
416 UCharIteratorHasPrevious *hasPrevious;
417
418 /**
419 * (public) Return the code unit at the current position,
420 * or U_SENTINEL if there is none (index is at the limit).
421 *
422 * @see UCharIteratorCurrent
423 * @stable ICU 2.1
424 */
425 UCharIteratorCurrent *current;
426
427 /**
428 * (public) Return the code unit at the current index and increment
429 * the index (post-increment, like s[i++]),
430 * or return U_SENTINEL if there is none (index is at the limit).
431 *
432 * @see UCharIteratorNext
433 * @stable ICU 2.1
434 */
435 UCharIteratorNext *next;
436
437 /**
438 * (public) Decrement the index and return the code unit from there
439 * (pre-decrement, like s[--i]),
440 * or return U_SENTINEL if there is none (index is at the start).
441 *
442 * @see UCharIteratorPrevious
443 * @stable ICU 2.1
444 */
445 UCharIteratorPrevious *previous;
446
447 /**
448 * (public) Reserved for future use. Currently NULL.
449 *
450 * @see UCharIteratorReserved
451 * @stable ICU 2.1
452 */
453 UCharIteratorReserved *reservedFn;
454
455 /**
456 * (public) Return the state of the iterator, to be restored later with setState().
457 * This function pointer is NULL if the iterator does not implement it.
458 *
459 * @see UCharIteratorGet
460 * @stable ICU 2.6
461 */
462 UCharIteratorGetState *getState;
463
464 /**
465 * (public) Restore the iterator state from the state word from a call
466 * to getState().
467 * This function pointer is NULL if the iterator does not implement it.
468 *
469 * @see UCharIteratorSet
470 * @stable ICU 2.6
471 */
472 UCharIteratorSetState *setState;
473};
474
475/**
476 * Helper function for UCharIterator to get the code point
477 * at the current index.
478 *
479 * Return the code point that includes the code unit at the current position,
480 * or U_SENTINEL if there is none (index is at the limit).
481 * If the current code unit is a lead or trail surrogate,
482 * then the following or preceding surrogate is used to form
483 * the code point value.
484 *
485 * @param iter the UCharIterator structure ("this pointer")
486 * @return the current code point
487 *
488 * @see UCharIterator
489 * @see U16_GET
490 * @see UnicodeString::char32At()
491 * @stable ICU 2.1
492 */
493U_STABLE UChar32 U_EXPORT2
494uiter_current32(UCharIterator *iter);
495
496/**
497 * Helper function for UCharIterator to get the next code point.
498 *
499 * Return the code point at the current index and increment
500 * the index (post-increment, like s[i++]),
501 * or return U_SENTINEL if there is none (index is at the limit).
502 *
503 * @param iter the UCharIterator structure ("this pointer")
504 * @return the current code point (and post-increment the current index)
505 *
506 * @see UCharIterator
507 * @see U16_NEXT
508 * @stable ICU 2.1
509 */
510U_STABLE UChar32 U_EXPORT2
511uiter_next32(UCharIterator *iter);
512
513/**
514 * Helper function for UCharIterator to get the previous code point.
515 *
516 * Decrement the index and return the code point from there
517 * (pre-decrement, like s[--i]),
518 * or return U_SENTINEL if there is none (index is at the start).
519 *
520 * @param iter the UCharIterator structure ("this pointer")
521 * @return the previous code point (after pre-decrementing the current index)
522 *
523 * @see UCharIterator
524 * @see U16_PREV
525 * @stable ICU 2.1
526 */
527U_STABLE UChar32 U_EXPORT2
528uiter_previous32(UCharIterator *iter);
529
530/**
531 * Get the "state" of the iterator in the form of a single 32-bit word.
532 * This is a convenience function that calls iter->getState(iter)
533 * if iter->getState is not NULL;
534 * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
535 *
536 * Some UCharIterator implementations may not be able to return
537 * a valid state for each position, in which case they return UITER_NO_STATE instead.
538 * This will be clearly documented for each such iterator (none of the public ones here).
539 *
540 * @param iter the UCharIterator structure ("this pointer")
541 * @return the state word
542 *
543 * @see UCharIterator
544 * @see UCharIteratorGetState
545 * @see UITER_NO_STATE
546 * @stable ICU 2.6
547 */
548U_STABLE uint32_t U_EXPORT2
549uiter_getState(const UCharIterator *iter);
550
551/**
552 * Restore the "state" of the iterator using a state word from a getState() call.
553 * This is a convenience function that calls iter->setState(iter, state, pErrorCode)
554 * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
555 *
556 * @param iter the UCharIterator structure ("this pointer")
557 * @param state the state word from a getState() call
558 * on a same-type, same-string iterator
559 * @param pErrorCode Must be a valid pointer to an error code value,
560 * which must not indicate a failure before the function call.
561 *
562 * @see UCharIterator
563 * @see UCharIteratorSetState
564 * @stable ICU 2.6
565 */
566U_STABLE void U_EXPORT2
567uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
568
569/**
570 * Set up a UCharIterator to iterate over a string.
571 *
572 * Sets the UCharIterator function pointers for iteration over the string s
573 * with iteration boundaries start=index=0 and length=limit=string length.
574 * The "provider" may set the start, index, and limit values at any time
575 * within the range 0..length.
576 * The length field will be ignored.
577 *
578 * The string pointer s is set into UCharIterator.context without copying
579 * or reallocating the string contents.
580 *
581 * getState() simply returns the current index.
582 * move() will always return the final index.
583 *
584 * @param iter UCharIterator structure to be set for iteration
585 * @param s String to iterate over
586 * @param length Length of s, or -1 if NUL-terminated
587 *
588 * @see UCharIterator
589 * @stable ICU 2.1
590 */
591U_STABLE void U_EXPORT2
592uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
593
594/**
595 * Set up a UCharIterator to iterate over a UTF-16BE string
596 * (byte vector with a big-endian pair of bytes per UChar).
597 *
598 * Everything works just like with a normal UChar iterator (uiter_setString),
599 * except that UChars are assembled from byte pairs,
600 * and that the length argument here indicates an even number of bytes.
601 *
602 * getState() simply returns the current index.
603 * move() will always return the final index.
604 *
605 * @param iter UCharIterator structure to be set for iteration
606 * @param s UTF-16BE string to iterate over
607 * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
608 * (NUL means pair of 0 bytes at even index from s)
609 *
610 * @see UCharIterator
611 * @see uiter_setString
612 * @stable ICU 2.6
613 */
614U_STABLE void U_EXPORT2
615uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
616
617/**
618 * Set up a UCharIterator to iterate over a UTF-8 string.
619 *
620 * Sets the UCharIterator function pointers for iteration over the UTF-8 string s
621 * with UTF-8 iteration boundaries 0 and length.
622 * The implementation counts the UTF-16 index on the fly and
623 * lazily evaluates the UTF-16 length of the text.
624 *
625 * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
626 * When the reservedField is not 0, then it contains a supplementary code point
627 * and the UTF-16 index is between the two corresponding surrogates.
628 * At that point, the UTF-8 index is behind that code point.
629 *
630 * The UTF-8 string pointer s is set into UCharIterator.context without copying
631 * or reallocating the string contents.
632 *
633 * getState() returns a state value consisting of
634 * - the current UTF-8 source byte index (bits 31..1)
635 * - a flag (bit 0) that indicates whether the UChar position is in the middle
636 * of a surrogate pair
637 * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
638 *
639 * getState() cannot also encode the UTF-16 index in the state value.
640 * move(relative to limit or length), or
641 * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
642 *
643 * @param iter UCharIterator structure to be set for iteration
644 * @param s UTF-8 string to iterate over
645 * @param length Length of s in bytes, or -1 if NUL-terminated
646 *
647 * @see UCharIterator
648 * @stable ICU 2.6
649 */
650U_STABLE void U_EXPORT2
651uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
652
#if U_SHOW_CPLUSPLUS_API

/**
 * Set up a UCharIterator to wrap around a C++ CharacterIterator.
 *
 * Sets the UCharIterator function pointers for iteration using the
 * CharacterIterator charIter.
 *
 * The CharacterIterator pointer charIter is set into UCharIterator.context
 * without copying or cloning the CharacterIterator object.
 * The other "protected" UCharIterator fields are set to 0 and will be ignored.
 * The iteration index and boundaries are controlled by the CharacterIterator.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param charIter CharacterIterator to wrap
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
U_STABLE void U_EXPORT2
uiter_setCharacterIterator(UCharIterator *iter, icu::CharacterIterator *charIter);

/**
 * Set up a UCharIterator to iterate over a C++ Replaceable.
 *
 * Sets the UCharIterator function pointers for iteration over the
 * Replaceable rep with iteration boundaries start=index=0 and
 * length=limit=rep->length().
 * The "provider" may set the start, index, and limit values at any time
 * within the range 0..length=rep->length().
 * The length field will be ignored.
 *
 * The Replaceable pointer rep is set into UCharIterator.context without copying
 * or cloning/reallocating the Replaceable object.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param rep Replaceable to iterate over
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
U_STABLE void U_EXPORT2
uiter_setReplaceable(UCharIterator *iter, const icu::Replaceable *rep);

#endif

705U_CDECL_END
706
707#endif
708