Blame - jni/libucommon/sources/inc/ucommon/unicode.h - jami-client-android

blob: ba51fe228377155446ac3e728ababf488ca08fe6 [file] [log] [blame]

Alexandre Lision	ddd731e	2014-01-31 11:50:08 -0500	[diff] [blame]	1	// Copyright (C) 2009-2010 David Sugar, Tycho Softworks.
				2	//
				3	// This file is part of GNU uCommon C++.
				4	//
				5	// GNU uCommon C++ is free software: you can redistribute it and/or modify
				6	// it under the terms of the GNU Lesser General Public License as published
				7	// by the Free Software Foundation, either version 3 of the License, or
				8	// (at your option) any later version.
				9	//
				10	// GNU uCommon C++ is distributed in the hope that it will be useful,
				11	// but WITHOUT ANY WARRANTY; without even the implied warranty of
				12	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				13	// GNU Lesser General Public License for more details.
				14	//
				15	// You should have received a copy of the GNU Lesser General Public License
				16	// along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>.
				17
				18	/**
				19	* Basic UCommon Unicode support.
				20	* This includes computing unicode transcoding and supporting a
				21	* UTF8-aware string class (UString). We may add support for a wchar_t
				22	* aware string class as well, as some external api libraries may require
				23	* ucs-2 or 4 encoded strings.
				24	* @file ucommon/unicode.h
				25	*/
				26
				27	/**
				28	* An example of some unicode-utf8 transcoding.
				29	* @example unicode.cpp
				30	*/
				31
				32	#ifndef _UCOMMON_UNICODE_H_
				33	#define _UCOMMON_UNICODE_H_
				34
				35	#ifndef _UCOMMON_STRING_H_
				36	#include <ucommon/string.h>
				37	#endif
				38
				39	NAMESPACE_UCOMMON
				40
				41	/**
				42	* 32 bit unicode character code. We may extract this from a ucs2 or utf8
				43	* string.
				44	*/
				45	typedef int32_t ucs4_t;
				46
				47	/**
				48	* 16 bit unicode character code. Java and some APIs use these.
				49	*/
				50	typedef int16_t ucs2_t;
				51
				52	/**
				53	* Untyped pointer to unicode data. Resolves issues where wchar_t is not defined.
				54	*/
				55	typedef void *unicode_t;
				56
				57	/**
				58	* A core class of utf8 encoded string functions. This is a foundation for
				59	* all utf8 string processing.
				60	* @author David Sugar
				61	*/
				62	class __EXPORT utf8
				63	{
				64	public:
				65	/**
				66	* Size of "unicode_t" character codes, may not be ucs4_t size.
				67	*/
				68	static const unsigned ucsize;
				69
				70	/**
				71	* A convenient NULL pointer value.
				72	*/
				73	static const char *nil;
				74
				75	/**
				76	* Compute character size of utf8 string codepoint.
				77	* @param codepoint in string.
				78	* @return size of codepoint as utf8 encoded data, 0 if invalid.
				79	*/
				80	static unsigned size(const char *codepoint);
				81
				82	/**
				83	* Count utf8 encoded ucs4 codepoints in string.
				84	* @param string of utf8 data.
				85	* @return codepoint count, 0 if empty or invalid.
				86	*/
				87	static size_t count(const char *string);
				88
				89	/**
				90	* Get codepoint offset in a string.
				91	* @param string of utf8 data.
				92	* @param position of codepoint in string, negative offsets are from tail.
				93	* @return pointer to codepoint at offset or NULL if invalid.
				94	*/
				95	static char *offset(char *string, ssize_t position);
				96
				97	/**
				98	* Convert a utf8 encoded codepoint to a ucs4 character value.
				99	* @param encoded utf8 codepoint.
				100	* @return ucs4 character value or 0 if invalid.
				101	*/
				102	static ucs4_t codepoint(const char *encoded);
				103
				104	/**
				105	* How many chars are required to encode a given wchar string.
				106	* @param string of ucs4 data.
				107	* @return number of chars required to encode given string.
				108	*/
				109	static size_t chars(const unicode_t string);
				110
				111	/**
				112	* How many chars are required to encode a given unicode character.
				113	* @param character to encode.
				114	* @return number of chars required to encode given character.
				115	*/
				116	static size_t chars(ucs4_t character);
				117
				118	/**
				119	* Convert a unicode string into utf8.
				120	* @param string of unicode data to pack
				121	* @param buffer of character protocol to put data into.
				122	* @return number of code points converted.
				123	*/
				124	static size_t unpack(const unicode_t string, CharacterProtocol& buffer);
				125
				126	/**
				127	* Convert a utf8 string into a unicode data buffer.
				128	* @param unicode data buffer.
				129	* @param buffer of character protocol to pack from.
				130	* @param size of unicode data buffer in codepoints.
				131	* @return number of code points converted.
				132	*/
				133	static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size);
				134
				135	/**
				136	* Dup a utf8 string into a ucs4_t string.
				137	*/
				138	static ucs4_t *udup(const char *string);
				139
				140	/**
				141	* Dup a utf8 string into a ucs2_t representation.
				142	*/
				143	static ucs2_t *wdup(const char *string);
				144
				145	/**
				146	* Find first occurrence of character in string.
				147	* @param string to search in.
				148	* @param character code to search for.
				149	* @param start offset in string in codepoints.
				150	* @return pointer to first instance or NULL if not found.
				151	*/
				152	static const char *find(const char *string, ucs4_t character, size_t start = 0);
				153
				154	/**
				155	* Find last occurrence of character in string.
				156	* @param string to search in.
				157	* @param character code to search for.
				158	* @param end offset to start from in codepoints.
				159	* @return pointer to last instance or NULL if not found.
				160	*/
				161	static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);
				162
				163	/**
				164	* Count occurrences of a unicode character in string.
				165	* @param string to search in.
				166	* @param character code to search for.
				167	* @return count of occurrences.
				168	*/
				169	static unsigned ccount(const char *string, ucs4_t character);
				170
				171	/**
				172	* Get a unicode character from a character protocol.
				173	* @param buffer of character protocol to read from.
				174	* @return unicode character or EOF on error.
				175	*/
				176	static ucs4_t get(CharacterProtocol& buffer);
				177
				178	/**
				179	* Push a unicode character to a character protocol.
				180	* @param character to push to buffer.
				181	* @param buffer of character protocol to push character to.
				182	* @return unicode character or EOF on error.
				183	*/
				184	static ucs4_t put(ucs4_t character, CharacterProtocol& buffer);
				185	};
				186
				187	/**
				188	* A copy-on-write utf8 string class that operates by reference count. This
				189	* is derived from the classic uCommon String class by adding operations that
				190	* are utf8 encoding aware.
				191	* @author David Sugar <dyfet@gnutelephony.org>
				192	*/
				193	class __EXPORT UString : public String, public utf8
				194	{
				195	protected: // NOTE(review): no public: section follows, so every member below is protected - confirm this is intended.
				196	/**
				197	* Create a new empty utf8 aware string object.
				198	*/
				199	UString();
				200
				201	/**
				202	* Create an empty string with a buffer pre-allocated to a specified size.
				203	* @param size of buffer to allocate.
				204	*/
				205	UString(strsize_t size);
				206
				207	/**
				208	* Create a utf8 aware string for a null terminated unicode string.
				209	* @param text of ucs4 encoded data.
				210	*/
				211	UString(const unicode_t text);
				212
				213	/**
				214	* Create a string from null terminated text up to a maximum specified
				215	* size.
				216	* @param text to use for string.
				217	* @param size limit of new string.
				218	*/
				219	UString(const char *text, strsize_t size);
				220
				221	/**
				222	* Create a string for a substring. The end of the substring is a
				223	* pointer within the substring itself.
				224	* @param text to use for string.
				225	* @param end of text in substring.
				226	*/
				227	UString(const unicode_t text, const unicode_t end);
				228
				229	/**
				230	* Construct a copy of a string object. Our copy inherits the same
				231	* reference counted instance of cstring as in the original.
				232	* @param existing string to copy from.
				233	*/
				234	UString(const UString& existing);
				235
				236	/**
				237	* Destroy string. De-reference cstring. If last reference to cstring,
				238	* then also remove cstring from heap.
				239	*/
				240	virtual ~UString();
				241
				242	/**
				243	* Get a new string object as a substring of the current object.
				244	* @param codepoint offset of substring.
				245	* @param size of substring in codepoints or 0 if to end.
				246	* @return string object holding substring.
				247	*/
				248	UString get(strsize_t codepoint, strsize_t size = 0) const;
				249
				250	/**
				251	* Extract a unicode byte sequence from utf8 object.
				252	* @param unicode data buffer.
				253	* @param size of data buffer.
				254	* @return codepoints copied.
				255	*/
				256	size_t get(unicode_t unicode, size_t size) const;
				257
				258	/**
				259	* Set a utf8 encoded string based on unicode data.
				260	* @param unicode text to set.
				261	*/
				262	void set(const unicode_t unicode);
				263
				264	/**
				265	* Add (append) unicode to a utf8 encoded string.
				266	* @param unicode text to add.
				267	*/
				268	void add(const unicode_t unicode);
				269
				270	/**
				271	* Return unicode character found at a specific codepoint in the string.
				272	* @param position of codepoint in string, negative values computed from end.
				273	* @return character code at specified position in string.
				274	*/
				275	ucs4_t at(int position) const;
				276
				277	/**
				278	* Extract a unicode byte sequence from utf8 object. Forwards to get().
				279	* @param unicode data buffer.
				280	* @param size of data buffer.
				281	* @return codepoints copied.
				282	*/
				283	inline size_t operator()(unicode_t unicode, size_t size) const
				284	{return get(unicode, size);};
				285
				286	/**
				287	* Get a new substring through object expression.
				288	* @param codepoint offset of substring.
				289	* @param size of substring in codepoints or 0 if to end.
				290	* @return string object holding substring.
				291	*/
				292	UString operator()(int codepoint, strsize_t size) const;
				293
				294	/**
				295	* Convenience method for left of string.
				296	* @param size of substring to gather in codepoints.
				297	* @return string object holding substring.
				298	*/
				299	inline UString left(strsize_t size) const
				300	{return operator()(0, size);}
				301
				302	/**
				303	* Convenience method for right of string.
				304	* @param offset of substring from right in codepoints.
				305	* @return string object holding substring.
				306	*/
				307	inline UString right(strsize_t offset) const
				308	{return operator()(-((int)offset), 0);}
				309
				310	/**
				311	* Convenience method for substring extraction.
				312	* @param offset into string in codepoints.
				313	* @param size of string to return in codepoints.
				314	* @return string object holding substring.
				315	*/
				316	inline UString copy(strsize_t offset, strsize_t size) const
				317	{return operator()((int)offset, size);}
				318
				319	/**
				320	* Cut (remove) text from string using codepoint offsets.
				321	* @param offset to start of text field to remove.
				322	* @param size of text field to remove or 0 to remove to end of string.
				323	*/
				324	void cut(strsize_t offset, strsize_t size = 0);
				325
				326	/**
				327	* Insert (paste) text into string using codepoint offsets.
				328	* @param offset to start paste.
				329	* @param text to paste.
				330	* @param size of text to paste.
				331	*/
				332	void paste(strsize_t offset, const char *text, strsize_t size = 0);
				333
				334	/**
				335	* Reference a string in the object by codepoint offset. Positive
				336	* offsets are from the start of the string, negative from the
				337	* end.
				338	* @param offset to string position.
				339	* @return pointer to string data or NULL if invalid offset.
				340	*/
				341	const char *operator()(int offset) const;
				342
				343	/**
				344	* Reference a unicode character in string object by array offset.
				345	* Forwards to at().
				346	* @return character value at offset.
				347	*/
				348	inline ucs4_t operator[](int position) const
				349	{return UString::at(position);};
				350
				351	/**
				352	* Count codepoints in current string.
				353	* @return count of codepoints.
				354	*/
				355	inline strsize_t count(void) const
				356	{return utf8::count(str->text);}
				357
				358	/**
				359	* Count occurrences of a unicode character in string.
				360	* @param character code to search for.
				361	* @return count of occurrences.
				362	*/
				363	unsigned ccount(ucs4_t character) const;
				364
				365	/**
				366	* Find first occurrence of character in string.
				367	* @param character code to search for.
				368	* @param start offset in string in codepoints.
				369	* @return pointer to first instance or NULL if not found.
				370	*/
				371	const char *find(ucs4_t character, strsize_t start = 0) const;
				372
				373	/**
				374	* Find last occurrence of character in string.
				375	* @param character code to search for.
				376	* @param end offset to start from in codepoints.
				377	* @return pointer to last instance or NULL if not found.
				378	*/
				379	const char *rfind(ucs4_t character, strsize_t end = npos) const;
				380	};
				381
				382	/**
				383	* Pointer to utf8 encoded character data. This is a kind of "char *" for
				384	* utf8 text.
				385	* @author David Sugar <dyfet@gnutelephony.org>
				386	*/
				387	class __EXPORT utf8_pointer
				388	{
				389	protected:
				390	uint8_t *text;
				391
				392	public:
				393	/**
				394	* Create a utf8 pointer set to NULL.
				395	*/
				396	utf8_pointer();
				397
				398	/**
				399	* Create a utf8 pointer for an existing char pointer.
				400	* @param string pointer to use.
				401	*/
				402	utf8_pointer(const char *string);
				403
				404	/**
				405	* Create a utf8 pointer as a copy of existing utf8 pointer.
				406	* @param copy of object to use.
				407	*/
				408	utf8_pointer(const utf8_pointer& copy);
				409
				410	/**
				411	* Iterative increment of a utf8 pointer to next codepoint.
				412	* @return object incremented.
				413	*/
				414	utf8_pointer& operator ++();
				415
				416	/**
				417	* Iterative decrement of a utf8 pointer to prior codepoint.
				418	* @return object decremented.
				419	*/
				420	utf8_pointer& operator --();
				421
				422	/**
				423	* Adjust utf8 pointer by specified codepoints forward.
				424	* @param offset to increment by.
				425	* @return object incremented.
				426	*/
				427	utf8_pointer& operator +=(long offset);
				428
				429	/**
				430	* Adjust utf8 pointer by specified codepoints backward.
				431	* @param offset to decrement by.
				432	* @return object decremented.
				433	*/
				434	utf8_pointer& operator -=(long offset);
				435
				436	/**
				437	* Get new utf8 pointer after adding a codepoint offset.
				438	* @param offset to add.
				439	* @return new utf8 pointer pointing to specified offset.
				440	*/
				441	utf8_pointer operator+(long offset) const;
				442
				443	/**
				444	* Get new utf8 pointer after subtracting a codepoint offset.
				445	* @param offset to subtract.
				446	* @return new utf8 pointer pointing to specified offset.
				447	*/
				448	utf8_pointer operator-(long offset) const;
				449
				450	/**
				451	* Check if text is valid pointer.
				452	* @return true if not NULL.
				453	*/
				454	inline operator bool() const
				455	{return text != NULL;};
				456
				457	/**
				458	* Check if text is an invalid pointer.
				459	* @return true if NULL.
				460	*/
				461	inline bool operator!() const
				462	{return text == NULL;};
				463
				464	/**
				465	* Extract a unicode character from a specified codepoint.
				466	* @param codepoint offset to extract character from.
				467	* @return unicode character or 0.
				468	*/
				469	ucs4_t operator[](long codepoint) const;
				470
				471	/**
				472	* Assign a utf8 string to point to.
				473	* @param string to point to.
				474	* @return current object after set to string.
				475	*/
				476	utf8_pointer& operator=(const char *string);
				477
				478	/**
				479	* Iterative increment of a utf8 pointer to next codepoint.
				480	*/
				481	void inc(void);
				482
				483	/**
				484	* Iterative decrement of a utf8 pointer to prior codepoint.
				485	*/
				486	void dec(void);
				487
				488	/**
				489	* Check if pointer equals another string (address comparison only).
				490	* @param string to check.
				491	* @return true if same memory address.
				492	*/
				493	inline bool operator==(const char *string) const
				494	{return (const char *)text == string;};
				495
				496	/**
				497	* Check if pointer does not equal another string (address comparison only).
				498	* @param string to check.
				499	* @return false if same memory address.
				500	*/
				501	inline bool operator!=(const char *string) const
				502	{return (const char *)text != string;};
				503
				504	/**
				505	* Get unicode character pointed to by pointer.
				506	* @return unicode character we are pointing to.
				507	*/
				508	inline ucs4_t operator*() const
				509	{return utf8::codepoint((const char *)text);};
				510
				511	/**
				512	* Get c string we point to.
				513	* @return string we point to.
				514	*/
				515	inline char *c_str(void) const
				516	{return (char *)text;};
				517
				518	/**
				519	* Convert utf8 pointer to a generic string pointer.
				520	* @return generic string pointer.
				521	*/
				522	inline operator char*() const
				523	{return (char *)text;};
				524
				525	/**
				526	* Get length of null terminated utf8 string in codepoints.
				527	* @return codepoint length of string.
				528	*/
				529	inline size_t len(void) const
				530	{return utf8::count((const char *)text);};
				531	};
				532
				533	inline ucs4_t *strudup(const char *string) // Free-function shorthand: forwards the utf8 string to utf8::udup().
				534	{return utf8::udup(string);}
				535
				536	inline ucs2_t *strwdup(const char *string) // Free-function shorthand: forwards the utf8 string to utf8::wdup().
				537	{return utf8::wdup(string);}
				538
				539	__EXPORT unicode_t unidup(const char *string); // Declaration only; presumably duplicates a utf8 string as unicode_t data - confirm against the implementation.
				540
				541	template<> // Specialization for ucs2_t strings: releases the pointer with ::free().
				542	inline void dupfree<ucs2_t*>(ucs2_t *string)
				543	{::free(string);}
				544
				545	template<> // Specialization for ucs4_t strings: releases the pointer with ::free().
				546	inline void dupfree<ucs4_t*>(ucs4_t *string)
				547	{::free(string);}
				548
				549	template<> // Specialization for unicode_t (a void pointer typedef): releases the pointer with ::free().
				550	inline void dupfree<unicode_t>(unicode_t string)
				551	{::free(string);}
				552
				553	/**
				554	* Convenience type name for the UString utf8 encoded string class.
				555	*/
				556	typedef UString ustring_t;
				557
				558	/**
				559	* Convenience type name for the utf8_pointer class.
				560	*/
				561	typedef utf8_pointer utf8_t;
				562
				563	END_NAMESPACE
				564
				565	#endif