ubrk.h source code [include/x86_64-linux-gnu/unicode/ubrk.h]

1	/*
2	******************************************************************************
3	* Copyright (C) 1996-2014, International Business Machines Corporation and others.
4	* All Rights Reserved.
5	******************************************************************************
6	*/
7
8	#ifndef UBRK_H
9	#define UBRK_H
10
11	#include "unicode/utypes.h"
12	#include "unicode/uloc.h"
13	#include "unicode/utext.h"
14	#include "unicode/localpointer.h"
15
/**
 * A text-break iterator.
 * For usage in C programs.
 */
#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
#   define UBRK_TYPEDEF_UBREAK_ITERATOR
    /**
     * Opaque type representing an ICU Break iterator object.
     * Only the struct tag is declared here, so the type can be used
     * exclusively through pointers.
     * The typedef is emitted only while UBRK_TYPEDEF_UBREAK_ITERATOR is
     * undefined, so it is declared at most once per translation unit.
     * @stable ICU 2.0
     */
    typedef struct UBreakIterator UBreakIterator;
#endif /* UBRK_TYPEDEF_UBREAK_ITERATOR */
28
29	#if !UCONFIG_NO_BREAK_ITERATION
30
31	#include "unicode/parseerr.h"
32
33	/**
34	* \file
35	* \brief C API: BreakIterator
36	*
37	* <h2> BreakIterator C API </h2>
38	*
39	* The BreakIterator C API defines functions for finding the location
40	* of boundaries in text. A UBreakIterator maintains a
41	* current position and scans over text, returning the indices of the characters
42	* where boundaries occur.
43	* <p>
44	* Line boundary analysis determines where a text string can be broken
45	* when line-wrapping. The mechanism correctly handles punctuation and
46	* hyphenated words.
47	* <p>
48	* Sentence boundary analysis allows selection with correct
49	* interpretation of periods within numbers and abbreviations, and
50	* trailing punctuation marks such as quotation marks and parentheses.
51	* <p>
52	* Word boundary analysis is used by search and replace functions, as
53	* well as within text editing applications that allow the user to
54	* select words with a double click. Word selection provides correct
55	* interpretation of punctuation marks within and following
56	* words. Characters that are not part of a word, such as symbols or
57	* punctuation marks, have word-breaks on both sides.
58	* <p>
59	* Character boundary analysis identifies the boundaries of
60	* "Extended Grapheme Clusters", which are groupings of codepoints
61	* that should be treated as character-like units for many text operations.
62	* Please see Unicode Standard Annex #29, Unicode Text Segmentation,
63	* http://www.unicode.org/reports/tr29/ for additional information
64	* on grapheme clusters and guidelines on their use.
65	* <p>
66	* Title boundary analysis locates all positions,
67	* typically starts of words, that should be set to Title Case
68	* when title casing the text.
69	* <p>
70	* The text boundary positions are found according to the rules
71	* described in Unicode Standard Annex #29, Text Boundaries, and
72	* Unicode Standard Annex #14, Line Breaking Properties. These
73	* are available at http://www.unicode.org/reports/tr14/ and
74	* http://www.unicode.org/reports/tr29/.
75	* <p>
76	* In addition to the plain C API defined in this header file, an
77	* object oriented C++ API with equivalent functionality is defined in the
78	* file brkiter.h.
79	* <p>
80	* Code snippets illustrating the use of the Break Iterator APIs
81	* are available in the ICU User Guide,
82	* http://icu-project.org/userguide/boundaryAnalysis.html
83	* and in the sample program icu/source/samples/break/break.cpp
84	*/
85
/** The possible types of text boundaries.  @stable ICU 2.0 */
typedef enum UBreakIteratorType {
    /** Character breaks  @stable ICU 2.0 */
    UBRK_CHARACTER = 0,
    /** Word breaks @stable ICU 2.0 */
    UBRK_WORD = 1,
    /** Line breaks @stable ICU 2.0 */
    UBRK_LINE = 2,
    /** Sentence breaks @stable ICU 2.0 */
    UBRK_SENTENCE = 3,

#ifndef U_HIDE_DEPRECATED_API
    /**
     * Title Case breaks
     * The iterator created using this type locates title boundaries as described for
     * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
     * please use Word Boundary iterator.
     *
     * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
     */
    UBRK_TITLE = 4,
#endif /* U_HIDE_DEPRECATED_API */
    /*
     * Number of iterator types. The value is spelled out explicitly, so it
     * stays 5 even when UBRK_TITLE is hidden by U_HIDE_DEPRECATED_API.
     */
    UBRK_COUNT = 5
} UBreakIteratorType;
110
/**
 * Value indicating all text boundaries have been returned.
 * The cast keeps the constant typed as int32_t, matching the return
 * type of the iteration functions declared in this header.
 * @stable ICU 2.0
 */
#define UBRK_DONE ((int32_t) -1)
115
116
/**
 *  Enum constants for the word break tags returned by
 *  getRuleStatus().  A range of values is defined for each category of
 *  word, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  Each *_LIMIT constant has the same value as the lower bound of the
 *  following category.
 *  @stable ICU 2.2
 */
typedef enum UWordBreak {
    /** Tag value for "words" that do not fit into any other category.
     *  Includes spaces and most punctuation. */
    UBRK_WORD_NONE           = 0,
    /** Upper bound for tags for uncategorized words. */
    UBRK_WORD_NONE_LIMIT     = 100,
    /** Tag value for words that appear to be numbers, lower limit. */
    UBRK_WORD_NUMBER         = 100,
    /** Tag value for words that appear to be numbers, upper limit. */
    UBRK_WORD_NUMBER_LIMIT   = 200,
    /** Tag value for words that contain letters, excluding
     *  hiragana, katakana or ideographic characters, lower limit. */
    UBRK_WORD_LETTER         = 200,
    /** Tag value for words containing letters, upper limit. */
    UBRK_WORD_LETTER_LIMIT   = 300,
    /** Tag value for words containing kana characters, lower limit. */
    UBRK_WORD_KANA           = 300,
    /** Tag value for words containing kana characters, upper limit. */
    UBRK_WORD_KANA_LIMIT     = 400,
    /** Tag value for words containing ideographic characters, lower limit. */
    UBRK_WORD_IDEO           = 400,
    /** Tag value for words containing ideographic characters, upper limit. */
    UBRK_WORD_IDEO_LIMIT     = 500
} UWordBreak;
149
/**
 *  Enum constants for the line break tags returned by getRuleStatus().
 *  A range of values is defined for each category of
 *  line break, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  @stable ICU 2.8
 */
typedef enum ULineBreakTag {
    /** Tag value for soft line breaks, positions at which a line break
     *  is acceptable but not required. */
    UBRK_LINE_SOFT            = 0,
    /** Upper bound for soft line breaks. */
    UBRK_LINE_SOFT_LIMIT      = 100,
    /** Tag value for a hard, or mandatory line break. */
    UBRK_LINE_HARD            = 100,
    /** Upper bound for hard line breaks. */
    UBRK_LINE_HARD_LIMIT      = 200
} ULineBreakTag;
169
170
171
/**
 *  Enum constants for the sentence break tags returned by getRuleStatus().
 *  A range of values is defined for each category of
 *  sentence, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  @stable ICU 2.8
 */
typedef enum USentenceBreakTag {
    /** Tag value for sentences ending with a sentence terminator
     *  ('.', '?', '!', etc.) character, possibly followed by a
     *  hard separator (CR, LF, PS, etc.)
     */
    UBRK_SENTENCE_TERM       = 0,
    /** Upper bound for tags for sentences ended by sentence terminators. */
    UBRK_SENTENCE_TERM_LIMIT = 100,
    /** Tag value for sentences that do not contain an ending
     *  sentence terminator ('.', '?', '!', etc.) character, but
     *  are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
     */
    UBRK_SENTENCE_SEP        = 100,
    /** Upper bound for tags for sentences ended by a separator. */
    UBRK_SENTENCE_SEP_LIMIT  = 200
} USentenceBreakTag;
197
198
199	/**
200	* Open a new UBreakIterator for locating text boundaries for a specified locale.
201	* A UBreakIterator may be used for detecting character, line, word,
202	* and sentence breaks in text.
203	* @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
204	* UBRK_LINE, UBRK_SENTENCE
205	* @param locale The locale specifying the text-breaking conventions.
206	* @param text The text to be iterated over.
207	* @param textLength The number of characters in text, or -1 if null-terminated.
208	* @param status A UErrorCode to receive any errors.
209	* @return A UBreakIterator for the specified locale.
210	* @see ubrk_openRules
211	* @stable ICU 2.0
212	*/
213	U_STABLE UBreakIterator* U_EXPORT2
214	ubrk_open(UBreakIteratorType type,
215	const char *locale,
216	const UChar *text,
217	int32_t textLength,
218	UErrorCode *status);
219
220	/**
221	* Open a new UBreakIterator for locating text boundaries using specified breaking rules.
222	* The rule syntax is ... (TBD)
223	* @param rules A set of rules specifying the text breaking conventions.
224	* @param rulesLength The number of characters in rules, or -1 if null-terminated.
225	* @param text The text to be iterated over. May be null, in which case ubrk_setText() is
226	* used to specify the text to be iterated.
227	* @param textLength The number of characters in text, or -1 if null-terminated.
228	* @param parseErr Receives position and context information for any syntax errors
229	* detected while parsing the rules.
230	* @param status A UErrorCode to receive any errors.
231	* @return A UBreakIterator for the specified rules.
232	* @see ubrk_open
233	* @stable ICU 2.2
234	*/
235	U_STABLE UBreakIterator* U_EXPORT2
236	ubrk_openRules(const UChar *rules,
237	int32_t rulesLength,
238	const UChar *text,
239	int32_t textLength,
240	UParseError *parseErr,
241	UErrorCode *status);
242
243	/**
244	* Thread safe cloning operation
245	* @param bi iterator to be cloned
246	* @param stackBuffer <em>Deprecated functionality as of ICU 52, use NULL.</em><br>
247	* user allocated space for the new clone. If NULL new memory will be allocated.
248	* If buffer is not large enough, new memory will be allocated.
249	* Clients can use the U_BRK_SAFECLONE_BUFFERSIZE.
250	* @param pBufferSize <em>Deprecated functionality as of ICU 52, use NULL or 1.</em><br>
251	* pointer to size of allocated space.
252	* If *pBufferSize == 0, a sufficient size for use in cloning will
253	* be returned ('pre-flighting')
254	* If *pBufferSize is not enough for a stack-based safe clone,
255	* new memory will be allocated.
256	* @param status to indicate whether the operation went on smoothly or there were errors
257	* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
258	* @return pointer to the new clone
259	* @stable ICU 2.0
260	*/
261	U_STABLE UBreakIterator * U_EXPORT2
262	ubrk_safeClone(
263	const UBreakIterator *bi,
264	void *stackBuffer,
265	int32_t *pBufferSize,
266	UErrorCode *status);
267
#ifndef U_HIDE_DEPRECATED_API

/**
 * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
 * @deprecated ICU 52. Do not rely on ubrk_safeClone() cloning into any provided buffer.
 */
#define U_BRK_SAFECLONE_BUFFERSIZE 1

#endif /* U_HIDE_DEPRECATED_API */
277
278	/**
279	* Close a UBreakIterator.
280	* Once closed, a UBreakIterator may no longer be used.
281	* @param bi The break iterator to close.
282	* @stable ICU 2.0
283	*/
284	U_STABLE void U_EXPORT2
285	ubrk_close(UBreakIterator *bi);
286
#if U_SHOW_CPLUSPLUS_API

U_NAMESPACE_BEGIN

/**
 * \class LocalUBreakIteratorPointer
 * "Smart pointer" class, closes a UBreakIterator via ubrk_close().
 * For most methods see the LocalPointerBase base class.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.4
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);

U_NAMESPACE_END

#endif /* U_SHOW_CPLUSPLUS_API */
305
306	/**
307	* Sets an existing iterator to point to a new piece of text
308	* @param bi The iterator to use
309	* @param text The text to be set
310	* @param textLength The length of the text
311	* @param status The error code
312	* @stable ICU 2.0
313	*/
314	U_STABLE void U_EXPORT2
315	ubrk_setText(UBreakIterator* bi,
316	const UChar* text,
317	int32_t textLength,
318	UErrorCode* status);
319
320
321	/**
322	* Sets an existing iterator to point to a new piece of text.
323	*
324	* All index positions returned by break iterator functions are
325	* native indices from the UText. For example, when breaking UTF-8
326	* encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.
327	* will be UTF-8 string indices, not UTF-16 positions.
328	*
329	* @param bi The iterator to use
330	* @param text The text to be set.
331	* This function makes a shallow clone of the supplied UText. This means
332	* that the caller is free to immediately close or otherwise reuse the
333	* UText that was passed as a parameter, but that the underlying text itself
334	* must not be altered while being referenced by the break iterator.
335	* @param status The error code
336	* @stable ICU 3.4
337	*/
338	U_STABLE void U_EXPORT2
339	ubrk_setUText(UBreakIterator* bi,
340	UText* text,
341	UErrorCode* status);
342
343
344
345	/**
346	* Determine the most recently-returned text boundary.
347	*
348	* @param bi The break iterator to use.
349	* @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
350	* \ref ubrk_first, or \ref ubrk_last.
351	* @stable ICU 2.0
352	*/
353	U_STABLE int32_t U_EXPORT2
354	ubrk_current(const UBreakIterator *bi);
355
356	/**
357	* Advance the iterator to the boundary following the current boundary.
358	*
359	* @param bi The break iterator to use.
360	* @return The character index of the next text boundary, or UBRK_DONE
361	* if all text boundaries have been returned.
362	* @see ubrk_previous
363	* @stable ICU 2.0
364	*/
365	U_STABLE int32_t U_EXPORT2
366	ubrk_next(UBreakIterator *bi);
367
368	/**
369	* Set the iterator position to the boundary preceding the current boundary.
370	*
371	* @param bi The break iterator to use.
372	* @return The character index of the preceding text boundary, or UBRK_DONE
373	* if all text boundaries have been returned.
374	* @see ubrk_next
375	* @stable ICU 2.0
376	*/
377	U_STABLE int32_t U_EXPORT2
378	ubrk_previous(UBreakIterator *bi);
379
380	/**
381	* Set the iterator position to zero, the start of the text being scanned.
382	* @param bi The break iterator to use.
383	* @return The new iterator position (zero).
384	* @see ubrk_last
385	* @stable ICU 2.0
386	*/
387	U_STABLE int32_t U_EXPORT2
388	ubrk_first(UBreakIterator *bi);
389
390	/**
391	* Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.
392	* This is not the same as the last character.
393	* @param bi The break iterator to use.
394	* @return The character offset immediately <EM>beyond</EM> the last character in the
395	* text being scanned.
396	* @see ubrk_first
397	* @stable ICU 2.0
398	*/
399	U_STABLE int32_t U_EXPORT2
400	ubrk_last(UBreakIterator *bi);
401
402	/**
403	* Set the iterator position to the first boundary preceding the specified offset.
404	* The new position is always smaller than offset, or UBRK_DONE.
405	* @param bi The break iterator to use.
406	* @param offset The offset to begin scanning.
407	* @return The text boundary preceding offset, or UBRK_DONE.
408	* @see ubrk_following
409	* @stable ICU 2.0
410	*/
411	U_STABLE int32_t U_EXPORT2
412	ubrk_preceding(UBreakIterator *bi,
413	int32_t offset);
414
415	/**
416	* Advance the iterator to the first boundary following the specified offset.
417	* The value returned is always greater than offset, or UBRK_DONE.
418	* @param bi The break iterator to use.
419	* @param offset The offset to begin scanning.
420	* @return The text boundary following offset, or UBRK_DONE.
421	* @see ubrk_preceding
422	* @stable ICU 2.0
423	*/
424	U_STABLE int32_t U_EXPORT2
425	ubrk_following(UBreakIterator *bi,
426	int32_t offset);
427
428	/**
429	* Get a locale for which text breaking information is available.
430	* A UBreakIterator in a locale returned by this function will perform the correct
431	* text breaking for the locale.
432	* @param index The index of the desired locale.
433	* @return A locale for which number text breaking information is available, or 0 if none.
434	* @see ubrk_countAvailable
435	* @stable ICU 2.0
436	*/
437	U_STABLE const char* U_EXPORT2
438	ubrk_getAvailable(int32_t index);
439
440	/**
441	* Determine how many locales have text breaking information available.
442	* This function is most useful as determining the loop ending condition for
443	* calls to \ref ubrk_getAvailable.
444	* @return The number of locales for which text breaking information is available.
445	* @see ubrk_getAvailable
446	* @stable ICU 2.0
447	*/
448	U_STABLE int32_t U_EXPORT2
449	ubrk_countAvailable(void);
450
451
452	/**
453	* Returns true if the specfied position is a boundary position. As a side
454	* effect, leaves the iterator pointing to the first boundary position at
455	* or after "offset".
456	* @param bi The break iterator to use.
457	* @param offset the offset to check.
458	* @return True if "offset" is a boundary position.
459	* @stable ICU 2.0
460	*/
461	U_STABLE UBool U_EXPORT2
462	ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
463
464	/**
465	* Return the status from the break rule that determined the most recently
466	* returned break position. The values appear in the rule source
467	* within brackets, {123}, for example. For rules that do not specify a
468	* status, a default value of 0 is returned.
469	* <p>
470	* For word break iterators, the possible values are defined in enum UWordBreak.
471	* @stable ICU 2.2
472	*/
473	U_STABLE int32_t U_EXPORT2
474	ubrk_getRuleStatus(UBreakIterator *bi);
475
476	/**
477	* Get the statuses from the break rules that determined the most recently
478	* returned break position. The values appear in the rule source
479	* within brackets, {123}, for example. The default status value for rules
480	* that do not explicitly provide one is zero.
481	* <p>
482	* For word break iterators, the possible values are defined in enum UWordBreak.
483	* @param bi The break iterator to use
484	* @param fillInVec an array to be filled in with the status values.
485	* @param capacity the length of the supplied vector. A length of zero causes
486	* the function to return the number of status values, in the
487	* normal way, without attemtping to store any values.
488	* @param status receives error codes.
489	* @return The number of rule status values from rules that determined
490	* the most recent boundary returned by the break iterator.
491	* @stable ICU 3.0
492	*/
493	U_STABLE int32_t U_EXPORT2
494	ubrk_getRuleStatusVec(UBreakIterator bi, int32_t fillInVec, int32_t capacity, UErrorCode *status);
495
496	/**
497	* Return the locale of the break iterator. You can choose between the valid and
498	* the actual locale.
499	* @param bi break iterator
500	* @param type locale type (valid or actual)
501	* @param status error code
502	* @return locale string
503	* @stable ICU 2.8
504	*/
505	U_STABLE const char* U_EXPORT2
506	ubrk_getLocaleByType(const UBreakIterator bi, ULocDataLocaleType type, UErrorCode status);
507
508	/**
509	* Set the subject text string upon which the break iterator is operating
510	* without changing any other aspect of the state.
511	* The new and previous text strings must have the same content.
512	*
513	* This function is intended for use in environments where ICU is operating on
514	* strings that may move around in memory. It provides a mechanism for notifying
515	* ICU that the string has been relocated, and providing a new UText to access the
516	* string in its new position.
517	*
518	* Note that the break iterator never copies the underlying text
519	* of a string being processed, but always operates directly on the original text
520	* provided by the user. Refreshing simply drops the references to the old text
521	* and replaces them with references to the new.
522	*
523	* Caution: this function is normally used only by very specialized
524	* system-level code. One example use case is with garbage collection
525	* that moves the text in memory.
526	*
527	* @param bi The break iterator.
528	* @param text The new (moved) text string.
529	* @param status Receives errors detected by this function.
530	*
531	* @stable ICU 49
532	*/
533	U_STABLE void U_EXPORT2
534	ubrk_refreshUText(UBreakIterator *bi,
535	UText *text,
536	UErrorCode *status);
537
538	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
539
540	#endif
541

Browse the source code of include/x86_64-linux-gnu/unicode/ubrk.h