blob: ba51fe228377155446ac3e728ababf488ca08fe6 [file] [log] [blame]
// Copyright (C) 2009-2010 David Sugar, Tycho Softworks.
// This file is part of GNU uCommon C++.
// GNU uCommon C++ is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// GNU uCommon C++ is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
// You should have received a copy of the GNU Lesser General Public License
// along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>.
* Basic UCommon Unicode support.
* This includes computing unicode transcoding and supporting a
* UTF8-aware string class (UString). We may add support for a wchar_t
* aware string class as well, as some external api libraries may require
* ucs-2 or 4 encoded strings.
* @file ucommon/unicode.h
* An example of some unicode-utf8 transcoding.
* @example unicode.cpp
#include <ucommon/string.h>
* 32 bit unicode character code. We may extract this from a ucs2 or utf8
* string.
typedef int32_t ucs4_t;
* 16 bit unicode character code. Java and some api's like these.
typedef int16_t ucs2_t;
* Resolves issues where wchar_t is not defined.
typedef void *unicode_t;
* A core class of utf8 encoded string functions. This is a foundation for
* all utf8 string processing.
* @author David Sugar
class __EXPORT utf8
* Size of "unicode_t" character codes, may not be ucs4_t size.
static const unsigned ucsize;
* A convenient NULL pointer value.
static const char *nil;
* Compute character size of utf8 string codepoint.
* @param codepoint in string.
* @return size of codepoint as utf8 encoded data, 0 if invalid.
static unsigned size(const char *codepoint);
* Count utf8 encoded ucs4 codepoints in string.
* @param string of utf8 data.
* @return codepoint count, 0 if empty or invalid.
static size_t count(const char *string);
* Get codepoint offset in a string.
* @param string of utf8 data.
* @param position of codepoint in string, negative offsets are from tail.
* @return offset of codepoint or NULL if invalid.
static char *offset(char *string, ssize_t position);
* Convert a utf8 encoded codepoint to a ucs4 character value.
* @param encoded utf8 codepoint.
* @return ucs4 character value or 0 if invalid.
static ucs4_t codepoint(const char *encoded);
* How many chars are required to encode a given wchar string.
* @param string of ucs4 data.
* @return number of chars required to encode given string.
static size_t chars(const unicode_t string);
* How many chars are required to encode a given unicode character.
* @param character to encode.
* @return number of chars required to encode given character.
static size_t chars(ucs4_t character);
* Convert a unicode string into utf8.
* @param string of unicode data to pack
* @param buffer of character protocol to put data into.
* @return number of code points converted.
static size_t unpack(const unicode_t string, CharacterProtocol& buffer);
* Convert a utf8 string into a unicode data buffer.
* @param unicode data buffer.
* @param buffer of character protocol to pack from.
* @param size of unicode data buffer in codepoints.
* @return number of code points converted.
static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size);
* Dup a utf8 string into a ucs4_t string.
static ucs4_t *udup(const char *string);
* Dup a utf8 string into a ucs2_t representation.
static ucs2_t *wdup(const char *string);
* Find first occurrence of character in string.
* @param string to search in.
* @param character code to search for.
* @param start offset in string in codepoints.
* @return pointer to first instance or NULL if not found.
static const char *find(const char *string, ucs4_t character, size_t start = 0);
* Find last occurrence of character in string.
* @param string to search in.
* @param character code to search for.
* @param end offset to start from in codepoints.
* @return pointer to last instance or NULL if not found.
static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);
* Count occurrences of a unicode character in string.
* @param string to search in.
* @param character code to search for.
* @return count of occurrences.
static unsigned ccount(const char *string, ucs4_t character);
* Get a unicode character from a character protocol.
* @param buffer of character protocol to read from.
* @return unicode character or EOF on error.
static ucs4_t get(CharacterProtocol& buffer);
* Push a unicode character to a character protocol.
* @param character to push to file.
* @param buffer of character protocol to push character to.
* @return unicode character or EOF on error.
static ucs4_t put(ucs4_t character, CharacterProtocol& buffer);
* A copy-on-write utf8 string class that operates by reference count. This
* is derived from the classic uCommon String class by adding operations that
* are utf8 encoding aware.
* @author David Sugar
class __EXPORT UString : public String, public utf8
* Create a new empty utf8 aware string object.
* Create an empty string with a buffer pre-allocated to a specified size.
* @param size of buffer to allocate.
UString(strsize_t size);
* Create a utf8 aware string for a null terminated unicode string.
* @param text of ucs4 encoded data.
UString(const unicode_t text);
* Create a string from null terminated text up to a maximum specified
* size.
* @param text to use for string.
* @param size limit of new string.
UString(const char *text, strsize_t size);
* Create a string for a substring. The end of the substring is a
* pointer within the substring itself.
* @param text to use for string.
* @param end of text in substring.
UString(const unicode_t *text, const unicode_t *end);
* Construct a copy of a string object. Our copy inherits the same
* reference counted instance of cstring as in the original.
* @param existing string to copy from.
UString(const UString& existing);
* Destroy string. De-reference cstring. If last reference to cstring,
* then also remove cstring from heap.
virtual ~UString();
* Get a new string object as a substring of the current object.
* @param codepoint offset of substring.
* @param size of substring in codepoints or 0 if to end.
* @return string object holding substring.
UString get(strsize_t codepoint, strsize_t size = 0) const;
* Extract a unicode byte sequence from utf8 object.
* @param unicode data buffer.
* @param size of data buffer.
* @return codepoints copied.
size_t get(unicode_t unicode, size_t size) const;
* Set a utf8 encoded string based on unicode data.
* @param unicode text to set.
void set(const unicode_t unicode);
* Add (append) unicode to a utf8 encoded string.
* @param unicode text to add.
void add(const unicode_t unicode);
* Return unicode character found at a specific codepoint in the string.
* @param position of codepoint in string, negative values computed from end.
* @return character code at specified position in string.
ucs4_t at(int position) const;
* Extract a unicode byte sequence from utf8 object.
* @param unicode data buffer.
* @param size of data buffer.
* @return codepoints copied.
// Call-operator shorthand for get(unicode, size): forwards both arguments
// unchanged and hands back whatever get() returns.
inline size_t operator()(unicode_t unicode, size_t size) const
{
    const size_t copied = get(unicode, size);
    return copied;
}
* Get a new substring through object expression.
* @param codepoint offset of substring.
* @param size of substring or 0 if to end.
* @return string object holding substring.
UString operator()(int codepoint, strsize_t size) const;
* Convenience method for left of string.
* @param size of substring to gather in codepoints.
* @return string object holding substring.
// Leading substring: delegates to the (offset, size) call operator with a
// zero starting offset.
inline UString left(strsize_t size) const
{
    return (*this)(0, size);
}
* Convenience method for right of string.
* @param offset of substring from right in codepoints.
* @return string object holding substring.
// Trailing substring: delegates to the (offset, size) call operator with the
// offset negated and a size of 0.
inline UString right(strsize_t offset) const
{
    const int from_tail = -static_cast<int>(offset);
    return (*this)(from_tail, 0);
}
* Convenience method for substring extraction.
* @param offset into string.
* @param size of string to return.
* @return string object holding substring.
// Substring extraction: converts the unsigned offset to int and delegates to
// the (offset, size) call operator.
inline UString copy(strsize_t offset, strsize_t size) const
{
    const int start = static_cast<int>(offset);
    return (*this)(start, size);
}
* Cut (remove) text from string using codepoint offsets.
* @param offset to start of text field to remove.
* @param size of text field to remove or 0 to remove to end of string.
void cut(strsize_t offset, strsize_t size = 0);
* Insert (paste) text into string using codepoint offsets.
* @param offset to start paste.
* @param text to paste.
* @param size of text to paste.
void paste(strsize_t offset, const char *text, strsize_t size = 0);
* Reference a string in the object by codepoint offset. Positive
* offsets are from the start of the string, negative from the
* end.
* @param offset to string position.
* @return pointer to string data or NULL if invalid offset.
const char *operator()(int offset) const;
* Reference a unicode character in string object by array offset.
* @param position of codepoint offset to character.
* @return character value at offset.
// Array-style access: forwards to UString::at() with the same position.
inline ucs4_t operator[](int position) const
{
    return this->UString::at(position);
}
* Count codepoints in current string.
* @return count of codepoints.
// Count the codepoints held by this string by running utf8::count() over
// the underlying text buffer.
inline strsize_t count(void) const
{
    // Guard against a string that has no buffer attached; report it as
    // holding zero codepoints instead of dereferencing a null pointer.
    // NOTE(review): assumes the inherited `str` member is NULL for an
    // empty/unallocated string -- confirm against ucommon/string.h.
    if(!str)
        return 0;

    return (strsize_t)utf8::count(str->text);
}
* Count occurrences of a unicode character in string.
* @param character code to search for.
* @return count of occurrences.
unsigned ccount(ucs4_t character) const;
* Find first occurrence of character in string.
* @param character code to search for.
* @param start offset in string in codepoints.
* @return pointer to first instance or NULL if not found.
const char *find(ucs4_t character, strsize_t start = 0) const;
* Find last occurrence of character in string.
* @param character code to search for.
* @param end offset to start from in codepoints.
* @return pointer to last instance or NULL if not found.
const char *rfind(ucs4_t character, strsize_t end = npos) const;
* Pointer to utf8 encoded character data. This is a kind of "char *" for
* utf8 text.
* @author David Sugar
class __EXPORT utf8_pointer
uint8_t *text;
* Create a utf8 pointer set to NULL.
* Create a utf8 pointer for an existing char pointer.
* @param string pointer to use.
utf8_pointer(const char *string);
* Create a utf8 pointer as a copy of existing utf8 pointer.
* @param copy of object to use.
utf8_pointer(const utf8_pointer& copy);
* Iterative increment of a utf8 pointer to next codepoint.
* @return object incremented.
utf8_pointer& operator ++();
* Iterative decrement of a utf8 pointer to prior codepoint.
* @return object decremented.
utf8_pointer& operator --();
* Adjust utf8 pointer by specified codepoints forward.
* @param offset to increment by.
* @return object incremented.
utf8_pointer& operator +=(long offset);
* Adjust utf8 pointer by specified codepoints backward.
* @param offset to decrement by.
* @return object decremented.
utf8_pointer& operator -=(long offset);
* Get new utf8 string after adding a codepoint offset.
* @param offset to add.
* @return new utf8 pointer pointing to specified offset.
utf8_pointer operator+(long offset) const;
* Get new utf8 string after subtracting a codepoint offset.
* @param offset to subtract.
* @return new utf8 pointer pointing to specified offset.
utf8_pointer operator-(long offset) const;
* Check if text is valid pointer.
* @return true if not NULL.
// Truth test: true exactly when the wrapped text pointer is not NULL.
inline operator bool() const
{
    return !(text == NULL);
}
* Check if text is an invalid pointer.
* @return false if not NULL.
// Negated truth test: true exactly when the wrapped text pointer is NULL.
inline bool operator!() const
{
    return !text;
}
* Extract a unicode character from a specified codepoint.
* @param codepoint offset to extract character from.
* @return unicode character or 0.
ucs4_t operator[](long codepoint) const;
* Assign a utf8 string to point to.
* @param string to point to.
* @return current object after set to string.
utf8_pointer& operator=(const char *string);
* Iterative increment of a utf8 pointer to next codepoint.
void inc(void);
* Iterative decrement of a utf8 pointer to prior codepoint.
void dec(void);
* check if pointer equals another string.
* @param string to check.
* @return true if same memory address.
// Address comparison: true when the wrapped pointer and `string` refer to
// the same memory address (the character data is not compared).
inline bool operator==(const char *string) const
{
    const char *self = reinterpret_cast<const char *>(text);
    return self == string;
}
* check if pointer does not equal another string.
* @param string to check.
* @return false if same memory address.
// Address comparison: true when the wrapped pointer and `string` refer to
// different memory addresses (the character data is not compared).
inline bool operator!=(const char *string) const
{
    const char *self = reinterpret_cast<const char *>(text);
    return self != string;
}
* Get unicode character pointed to by pointer.
* @return unicode character we are pointing to.
// Dereference: passes the wrapped pointer to utf8::codepoint() and returns
// its result.
inline ucs4_t operator*() const
{
    const char *encoded = reinterpret_cast<const char *>(text);
    return utf8::codepoint(encoded);
}
* Get c string we point to.
* @return string we point to.
// Expose the wrapped pointer as a plain char pointer; no copy is made.
inline char *c_str(void) const
{
    return reinterpret_cast<char *>(text);
}
* Convert utf8 pointer to a generic string pointer.
* @return generic string pointer.
// Implicit conversion to a plain char pointer; yields the wrapped pointer
// itself, with no copy made.
inline operator char*() const
{
    return reinterpret_cast<char *>(text);
}
* Get length of null terminated utf8 string in codepoints.
* @return codepoint length of string.
// Length in codepoints: passes the wrapped pointer to utf8::count() and
// returns its result.
inline size_t len(void) const
{
    const char *start = reinterpret_cast<const char *>(text);
    return utf8::count(start);
}
// Free-function convenience wrapper: forwards to utf8::udup() and returns
// its result unchanged.
inline ucs4_t *strudup(const char *string)
{
    ucs4_t *duplicate = utf8::udup(string);
    return duplicate;
}
// Free-function convenience wrapper: forwards to utf8::wdup() and returns
// its result unchanged.
inline ucs2_t *strwdup(const char *string)
{
    ucs2_t *duplicate = utf8::wdup(string);
    return duplicate;
}
__EXPORT unicode_t unidup(const char *string);
inline void dupfree<ucs2_t*>(ucs2_t *string)
inline void dupfree<ucs4_t*>(ucs4_t *string)
inline void dupfree<unicode_t>(unicode_t string)
* Convenience type for utf8 encoded strings.
typedef UString ustring_t;
* Convenience type for utf8_pointer strings.
typedef utf8_pointer utf8_t;