blob: ba51fe228377155446ac3e728ababf488ca08fe6 [file] [log] [blame]
Alexandre Lisionddd731e2014-01-31 11:50:08 -05001// Copyright (C) 2009-2010 David Sugar, Tycho Softworks.
2//
3// This file is part of GNU uCommon C++.
4//
5// GNU uCommon C++ is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Lesser General Public License as published
7// by the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// GNU uCommon C++ is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Lesser General Public License for more details.
14//
15// You should have received a copy of the GNU Lesser General Public License
16// along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>.
17
18/**
19 * Basic UCommon Unicode support.
20 * This includes computing unicode transcoding and supporting a
21 * UTF8-aware string class (UString). We may add support for a wchar_t
22 * aware string class as well, as some external api libraries may require
23 * ucs-2 or 4 encoded strings.
24 * @file ucommon/unicode.h
25 */
26
27/**
28 * An example of some unicode-utf8 transcoding.
29 * @example unicode.cpp
30 */
31
32#ifndef _UCOMMON_UNICODE_H_
33#define _UCOMMON_UNICODE_H_
34
35#ifndef _UCOMMON_STRING_H_
36#include <ucommon/string.h>
37#endif
38
39NAMESPACE_UCOMMON
40
/**
 * Unicode character code stored in a signed 32 bit integer.  A value of
 * this type may be extracted from a ucs2 or utf8 string.
 */
typedef int32_t ucs4_t;

/**
 * Unicode character code stored in a signed 16 bit integer.  Java and some
 * api's prefer this representation.
 */
typedef int16_t ucs2_t;

/**
 * Untyped pointer used to pass unicode text around.  This resolves issues
 * on platforms where wchar_t is not defined.
 */
typedef void *unicode_t;
56
57/**
58 * A core class of ut8 encoded string functions. This is a foundation for
59 * all utf8 string processing.
60 * @author David Sugar
61 */
62class __EXPORT utf8
63{
64public:
65 /**
66 * Size of "unicode_t" character codes, may not be ucs4_t size.
67 */
68 static const unsigned ucsize;
69
70 /**
71 * A convenient NULL pointer value.
72 */
73 static const char *nil;
74
75 /**
76 * Compute character size of utf8 string codepoint.
77 * @param codepoint in string.
78 * @return size of codepoint as utf8 encoded data, 0 if invalid.
79 */
80 static unsigned size(const char *codepoint);
81
82 /**
83 * Count ut8 encoded ucs4 codepoints in string.
84 * @param string of utf8 data.
85 * @return codepount count, 0 if empty or invalid.
86 */
87 static size_t count(const char *string);
88
89 /**
90 * Get codepoint offset in a string.
91 * @param string of utf8 data.
92 * @param position of codepoint in string, negative offsets are from tail.
93 * @return offset of codepoint or NULL if invalid.
94 */
95 static char *offset(char *string, ssize_t position);
96
97 /**
98 * Convert a utf8 encoded codepoint to a ucs4 character value.
99 * @param encoded utf8 codepoint.
100 * @return ucs4 string or 0 if invalid.
101 */
102 static ucs4_t codepoint(const char *encoded);
103
104 /**
105 * How many chars requires to encode a given wchar string.
106 * @param string of ucs4 data.
107 * @return number of chars required to encode given string.
108 */
109 static size_t chars(const unicode_t string);
110
111 /**
112 * How many chars requires to encode a given unicode character.
113 * @param character to encode.
114 * @return number of chars required to encode given character.
115 */
116 static size_t chars(ucs4_t character);
117
118 /**
119 * Convert a unicode string into utf8.
120 * @param string of unicode data to pack
121 * @param buffer of character protocol to put data into.
122 * @return number of code points converted.
123 */
124 static size_t unpack(const unicode_t string, CharacterProtocol& buffer);
125
126 /**
127 * Convert a utf8 string into a unicode data buffer.
128 * @param unicode data buffer.
129 * @param buffer of character protocol to pack from.
130 * @param size of unicode data buffer in codepoints.
131 * @return number of code points converted.
132 */
133 static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size);
134
135 /**
136 * Dup a utf8 string into a ucs4_t string.
137 */
138 static ucs4_t *udup(const char *string);
139
140 /**
141 * Dup a utf8 string into a ucs2_t representation.
142 */
143 static ucs2_t *wdup(const char *string);
144
145 /**
146 * Find first occurance of character in string.
147 * @param string to search in.
148 * @param character code to search for.
149 * @param start offset in string in codepoints.
150 * @return pointer to first instance or NULL if not found.
151 */
152 static const char *find(const char *string, ucs4_t character, size_t start = 0);
153
154 /**
155 * Find last occurrence of character in string.
156 * @param string to search in.
157 * @param character code to search for.
158 * @param end offset to start from in codepoints.
159 * @return pointer to last instance or NULL if not found.
160 */
161 static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);
162
163 /**
164 * Count occurrences of a unicode character in string.
165 * @param string to search in.
166 * @param character code to search for.
167 * @return count of occurrences.
168 */
169 static unsigned ccount(const char *string, ucs4_t character);
170
171 /**
172 * Get a unicode character from a character protocol.
173 * @param buffer of character protocol to read from.
174 * @return unicode character or EOF error.
175 */
176 static ucs4_t get(CharacterProtocol& buffer);
177
178 /**
179 * Push a unicode character to a character protocol.
180 * @param character to push to file.
181 * @param buffer of character protocol to push character to.
182 * @return unicode character or EOF on error.
183 */
184 static ucs4_t put(ucs4_t character, CharacterProtocol& buffer);
185};
186
187/**
188 * A copy-on-write utf8 string class that operates by reference count. This
189 * is derived from the classic uCommon String class by adding operations that
190 * are utf8 encoding aware.
191 * @author David Sugar <dyfet@gnutelephony.org>
192 */
193class __EXPORT UString : public String, public utf8
194{
195protected:
196 /**
197 * Create a new empty utf8 aware string object.
198 */
199 UString();
200
201 /**
202 * Create an empty string with a buffer pre-allocated to a specified size.
203 * @param size of buffer to allocate.
204 */
205 UString(strsize_t size);
206
207 /**
208 * Create a utf8 aware string for a null terminated unicode string.
209 * @param text of ucs4 encoded data.
210 */
211 UString(const unicode_t text);
212
213 /**
214 * Create a string from null terminated text up to a maximum specified
215 * size.
216 * @param text to use for string.
217 * @param size limit of new string.
218 */
219 UString(const char *text, strsize_t size);
220
221 /**
222 * Create a string for a substring. The end of the substring is a
223 * pointer within the substring itself.
224 * @param text to use for string.
225 * @param end of text in substring.
226 */
227 UString(const unicode_t *text, const unicode_t *end);
228
229 /**
230 * Construct a copy of a string object. Our copy inherits the same
231 * reference counted instance of cstring as in the original.
232 * @param existing string to copy from.
233 */
234 UString(const UString& existing);
235
236 /**
237 * Destroy string. De-reference cstring. If last reference to cstring,
238 * then also remove cstring from heap.
239 */
240 virtual ~UString();
241
242 /**
243 * Get a new string object as a substring of the current object.
244 * @param codepoint offset of substring.
245 * @param size of substring in codepoints or 0 if to end.
246 * @return string object holding substring.
247 */
248 UString get(strsize_t codepoint, strsize_t size = 0) const;
249
250 /**
251 * Extract a unicode byte sequence from utf8 object.
252 * @param unicode data buffer.
253 * @param size of data buffer.
254 * @return codepoints copied.
255 */
256 size_t get(unicode_t unicode, size_t size) const;
257
258 /**
259 * Set a utf8 encoded string based on unicode data.
260 * @param unicode text to set.
261 */
262 void set(const unicode_t unicode);
263
264 /**
265 * Add (append) unicode to a utf8 encoded string.
266 * @param unicode text to add.
267 */
268 void add(const unicode_t unicode);
269
270 /**
271 * Return unicode character found at a specific codepoint in the string.
272 * @param position of codepoint in string, negative values computed from end.
273 * @return character code at specified position in string.
274 */
275 ucs4_t at(int position) const;
276
277 /**
278 * Extract a unicode byte sequence from utf8 object.
279 * @param unicode data buffer.
280 * @param size of data buffer.
281 * @return codepoints copied.
282 */
283 inline size_t operator()(unicode_t unicode, size_t size) const
284 {return get(unicode, size);};
285
286 /**
287 * Get a new substring through object expression.
288 * @param codepoint offset of substring.
289 * @param size of substring or 0 if to end.
290 * @return string object holding substring.
291 */
292 UString operator()(int codepoint, strsize_t size) const;
293
294 /**
295 * Convenience method for left of string.
296 * @param size of substring to gather in codepoints.
297 * @return string object holding substring.
298 */
299 inline UString left(strsize_t size) const
300 {return operator()(0, size);}
301
302 /**
303 * Convenience method for right of string.
304 * @param offset of substring from right in codepoints.
305 * @return string object holding substring.
306 */
307 inline UString right(strsize_t offset) const
308 {return operator()(-((int)offset), 0);}
309
310 /**
311 * Convenience method for substring extraction.
312 * @param offset into string.
313 * @param size of string to return.
314 * @return string object holding substring.
315 */
316 inline UString copy(strsize_t offset, strsize_t size) const
317 {return operator()((int)offset, size);}
318
319 /**
320 * Cut (remove) text from string using codepoint offsets.
321 * @param offset to start of text field to remove.
322 * @param size of text field to remove or 0 to remove to end of string.
323 */
324 void cut(strsize_t offset, strsize_t size = 0);
325
326 /**
327 * Insert (paste) text into string using codepoint offsets.
328 * @param offset to start paste.
329 * @param text to paste.
330 * @param size of text to paste.
331 */
332 void paste(strsize_t offset, const char *text, strsize_t size = 0);
333
334 /**
335 * Reference a string in the object by codepoint offset. Positive
336 * offsets are from the start of the string, negative from the
337 * end.
338 * @param offset to string position.
339 * @return pointer to string data or NULL if invalid offset.
340 */
341 const char *operator()(int offset) const;
342
343 /**
344 * Reference a unicode character in string object by array offset.
345 * @param position of codepoint offset to character.
346 * @return character value at offset.
347 */
348 inline ucs4_t operator[](int position) const
349 {return UString::at(position);};
350
351 /**
352 * Count codepoints in current string.
353 * @return count of codepoints.
354 */
355 inline strsize_t count(void) const
356 {return utf8::count(str->text);}
357
358 /**
359 * Count occurrences of a unicode character in string.
360 * @param character code to search for.
361 * @return count of occurrences.
362 */
363 unsigned ccount(ucs4_t character) const;
364
365 /**
366 * Find first occurrence of character in string.
367 * @param character code to search for.
368 * @param start offset in string in codepoints.
369 * @return pointer to first instance or NULL if not found.
370 */
371 const char *find(ucs4_t character, strsize_t start = 0) const;
372
373 /**
374 * Find last occurrence of character in string.
375 * @param character code to search for.
376 * @param end offset to start from in codepoints.
377 * @return pointer to last instance or NULL if not found.
378 */
379 const char *rfind(ucs4_t character, strsize_t end = npos) const;
380};
381
382/**
383 * Pointer to utf8 encoded character data. This is a kind of "char *" for
384 * utf8 text.
385 * @author David Sugar <dyfet@gnutelephony.org>
386 */
387class __EXPORT utf8_pointer
388{
389protected:
390 uint8_t *text;
391
392public:
393 /**
394 * Create a utf8 pointer set to NULL.
395 */
396 utf8_pointer();
397
398 /**
399 * Create a utf8 pointer for an existing char pointer.
400 * @param string pointer to use.
401 */
402 utf8_pointer(const char *string);
403
404 /**
405 * Create a utf8 pointer as a copy of existing utf8 pointer.
406 * @param copy of object to use.
407 */
408 utf8_pointer(const utf8_pointer& copy);
409
410 /**
411 * Iterative increment of a utf8 pointer to prior codepoint.
412 * @return object incremented.
413 */
414 utf8_pointer& operator ++();
415
416 /**
417 * Iterative decrement of a utf8 pointer to next codepoint.
418 * @return object decremented.
419 */
420 utf8_pointer& operator --();
421
422 /**
423 * Adjust utf8 pointer by specified codepoints forward.
424 * @param offset to increment by.
425 * @return object incremented.
426 */
427 utf8_pointer& operator +=(long offset);
428
429 /**
430 * Adjust utf8 pointer by specified codepoints backward.
431 * @param offset to decrement by.
432 * @return object decremented.
433 */
434 utf8_pointer& operator -=(long offset);
435
436 /**
437 * Get new utf8 string after adding a codepoint offset.
438 * @param offset to add.
439 * @return new utf8 pointer pointing to specified offset.
440 */
441 utf8_pointer operator+(long offset) const;
442
443 /**
444 * Get new utf8 string after subtracting a codepoint offset.
445 * @param offset to subtract.
446 * @return new utf8 pointer pointing to specified offset.
447 */
448 utf8_pointer operator-(long offset) const;
449
450 /**
451 * Check if text is valid pointer.
452 * @return true if not NULL.
453 */
454 inline operator bool() const
455 {return text != NULL;};
456
457 /**
458 * Check if text is an invalid pointer.
459 * @return false if not NULL.
460 */
461 inline bool operator!() const
462 {return text == NULL;};
463
464 /**
465 * Extract a unicode character from a specified codepoint.
466 * @param codepoint offset to extract character from.
467 * @return unicode character or 0.
468 */
469 ucs4_t operator[](long codepoint) const;
470
471 /**
472 * Assign a utf8 string to point to.
473 * @param string to point to.
474 * @return current object after set to string.
475 */
476 utf8_pointer& operator=(const char *string);
477
478 /**
479 * Iterative increment of a utf8 pointer to next codepoint.
480 */
481 void inc(void);
482
483 /**
484 * Iterative decrement of a utf8 pointer to prior codepoint.
485 */
486 void dec(void);
487
488 /**
489 * check if pointer equals another string.
490 * @param string to check.
491 * @return true if same memory address.
492 */
493 inline bool operator==(const char *string) const
494 {return (const char *)text == string;};
495
496 /**
497 * check if pointer does not equal another string.
498 * @param string to check.
499 * @return false if same memory address.
500 */
501 inline bool operator!=(const char *string) const
502 {return (const char *)text != string;};
503
504 /**
505 * Get unicode character pointed to by pointer.
506 * @return unicode character we are pointing to.
507 */
508 inline ucs4_t operator*() const
509 {return utf8::codepoint((const char *)text);};
510
511 /**
512 * Get c string we point to.
513 * @return string we point to.
514 */
515 inline char *c_str(void) const
516 {return (char *)text;};
517
518 /**
519 * Convert utf8 pointer to a generic string pointer.
520 * @return generic string pointer.
521 */
522 inline operator char*() const
523 {return (char *)text;};
524
525 /**
526 * Get length of null terminated utf8 string in codepoints.
527 * @return codepoint length of string.
528 */
529 inline size_t len(void) const
530 {return utf8::count((const char *)text);};
531};
532
533inline ucs4_t *strudup(const char *string)
534 {return utf8::udup(string);}
535
536inline ucs2_t *strwdup(const char *string)
537 {return utf8::wdup(string);}
538
539__EXPORT unicode_t unidup(const char *string);
540
541template<>
542inline void dupfree<ucs2_t*>(ucs2_t *string)
543 {::free(string);}
544
545template<>
546inline void dupfree<ucs4_t*>(ucs4_t *string)
547 {::free(string);}
548
549template<>
550inline void dupfree<unicode_t>(unicode_t string)
551 {::free(string);}
552
553/**
554 * Convenience type for utf8 encoded strings.
555 */
556typedef UString ustring_t;
557
558/**
559 * Convenience type for utf8_pointer strings.
560 */
561typedef utf8_pointer utf8_t;
562
563END_NAMESPACE
564
565#endif