Blame - jni/libpcre/sources/pcre_valid_utf8.c - jami-client-android

blob: b94bcc98e6bc172ad848131616293ed7b4be7609 [file] [log] [blame]

Tristan Matthews	0461646	2013-11-14 16:09:34 -0500	[diff] [blame]	1	/*************************************************
				2	* Perl-Compatible Regular Expressions *
				3	*************************************************/
				4
				5	/* PCRE is a library of functions to support regular expressions whose syntax
				6	and semantics are as close as possible to those of the Perl 5 language.
				7
				8	Written by Philip Hazel
				9	Copyright (c) 1997-2009 University of Cambridge
				10
				11	-----------------------------------------------------------------------------
				12	Redistribution and use in source and binary forms, with or without
				13	modification, are permitted provided that the following conditions are met:
				14
				15	* Redistributions of source code must retain the above copyright notice,
				16	this list of conditions and the following disclaimer.
				17
				18	* Redistributions in binary form must reproduce the above copyright
				19	notice, this list of conditions and the following disclaimer in the
				20	documentation and/or other materials provided with the distribution.
				21
				22	* Neither the name of the University of Cambridge nor the names of its
				23	contributors may be used to endorse or promote products derived from
				24	this software without specific prior written permission.
				25
				26	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				27	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				28	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				29	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				30	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				31	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				32	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				33	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				34	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				35	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				36	POSSIBILITY OF SUCH DAMAGE.
				37	-----------------------------------------------------------------------------
				38	*/
				39
				40
				41	/* This module contains an internal function for validating UTF-8 character
				42	strings. */
				43
				44
				45	#ifdef HAVE_CONFIG_H
				46	#include "config.h"
				47	#endif
				48
				49	#include "pcre_internal.h"
				50
				51
				52	/*************************************************
				53	* Validate a UTF-8 string *
				54	*************************************************/
				55
				56	/* This function is called (optionally) at the start of compile or match, to
				57	check that a supposed UTF-8 string is actually valid. The early check means
				58	that subsequent code can assume it is dealing with a valid string. The check
				59	can be turned off for maximum performance, but the consequences of supplying an
				60	invalid string are then undefined.
				61
				62	Originally, this function checked according to RFC 2279, allowing for values in
				63	the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
				64	the canonical format. Once somebody had pointed out RFC 3629 to me (it
				65	obsoletes 2279), additional restrictions were applied. The values are now
				66	limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
				67	subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
				68	characters is still checked.
				69
				70	From release 8.13 more information about the details of the error are passed
				71	back in the returned value:
				72
				73	PCRE_UTF8_ERR0 No error
				74	PCRE_UTF8_ERR1 Missing 1 byte at the end of the string
				75	PCRE_UTF8_ERR2 Missing 2 bytes at the end of the string
				76	PCRE_UTF8_ERR3 Missing 3 bytes at the end of the string
				77	PCRE_UTF8_ERR4 Missing 4 bytes at the end of the string
				78	PCRE_UTF8_ERR5 Missing 5 bytes at the end of the string
				79	PCRE_UTF8_ERR6 2nd-byte's two top bits are not 0x80
				80	PCRE_UTF8_ERR7 3rd-byte's two top bits are not 0x80
				81	PCRE_UTF8_ERR8 4th-byte's two top bits are not 0x80
				82	PCRE_UTF8_ERR9 5th-byte's two top bits are not 0x80
				83	PCRE_UTF8_ERR10 6th-byte's two top bits are not 0x80
				84	PCRE_UTF8_ERR11 5-byte character is not permitted by RFC 3629
				85	PCRE_UTF8_ERR12 6-byte character is not permitted by RFC 3629
				86	PCRE_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted
				87	PCRE_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted
				88	PCRE_UTF8_ERR15 Overlong 2-byte sequence
				89	PCRE_UTF8_ERR16 Overlong 3-byte sequence
				90	PCRE_UTF8_ERR17 Overlong 4-byte sequence
				91	PCRE_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur)
				92	PCRE_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur)
				93	PCRE_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character)
				94	PCRE_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff
				95
				96	Arguments:
				97	string points to the string
				98	length length of string, or -1 if the string is zero-terminated
				99	erroroffset pointer to an error position offset variable
				100
				101	Returns: = 0 if the string is a valid UTF-8 string
				102	> 0 otherwise, setting the offset of the bad character
				103	*/
				104
				105	int
				106	_pcre_valid_utf8(USPTR string, int length, int *erroroffset)
				107	{
				108	#ifdef SUPPORT_UTF8
				109	register USPTR p;
				110
				111	if (length < 0)
				112	{
				113	for (p = string; *p != 0; p++);
				114	length = (int)(p - string);
				115	}
				116
				117	for (p = string; length-- > 0; p++)
				118	{
				119	register int ab, c, d;
				120
				121	c = *p;
				122	if (c < 128) continue; /* ASCII character */
				123
				124	if (c < 0xc0) /* Isolated 10xx xxxx byte */
				125	{
				126	*erroroffset = (int)(p - string);
				127	return PCRE_UTF8_ERR20;
				128	}
				129
				130	if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
				131	{
				132	*erroroffset = (int)(p - string);
				133	return PCRE_UTF8_ERR21;
				134	}
				135
				136	ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
				137	if (length < ab)
				138	{
				139	erroroffset = (int)(p - string); / Missing bytes */
				140	return ab - length; /* Codes ERR1 to ERR5 */
				141	}
				142	length -= ab; /* Length remaining */
				143
				144	/* Check top bits in the second byte */
				145
				146	if (((d = *(++p)) & 0xc0) != 0x80)
				147	{
				148	*erroroffset = (int)(p - string) - 1;
				149	return PCRE_UTF8_ERR6;
				150	}
				151
				152	/* For each length, check that the remaining bytes start with the 0x80 bit
				153	set and not the 0x40 bit. Then check for an overlong sequence, and for the
				154	excluded range 0xd800 to 0xdfff. */
				155
				156	switch (ab)
				157	{
				158	/* 2-byte character. No further bytes to check for 0x80. Check first byte
				159	for for xx00 000x (overlong sequence). */
				160
				161	case 1: if ((c & 0x3e) == 0)
				162	{
				163	*erroroffset = (int)(p - string) - 1;
				164	return PCRE_UTF8_ERR15;
				165	}
				166	break;
				167
				168	/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
				169	for 1110 0000, xx0x xxxx (overlong sequence) or
				170	1110 1101, 1010 xxxx (0xd800 - 0xdfff) */
				171
				172	case 2:
				173	if (((++p) & 0xc0) != 0x80) / Third byte */
				174	{
				175	*erroroffset = (int)(p - string) - 2;
				176	return PCRE_UTF8_ERR7;
				177	}
				178	if (c == 0xe0 && (d & 0x20) == 0)
				179	{
				180	*erroroffset = (int)(p - string) - 2;
				181	return PCRE_UTF8_ERR16;
				182	}
				183	if (c == 0xed && d >= 0xa0)
				184	{
				185	*erroroffset = (int)(p - string) - 2;
				186	return PCRE_UTF8_ERR14;
				187	}
				188	break;
				189
				190	/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
				191	bytes for for 1111 0000, xx00 xxxx (overlong sequence), then check for a
				192	character greater than 0x0010ffff (f4 8f bf bf) */
				193
				194	case 3:
				195	if (((++p) & 0xc0) != 0x80) / Third byte */
				196	{
				197	*erroroffset = (int)(p - string) - 2;
				198	return PCRE_UTF8_ERR7;
				199	}
				200	if (((++p) & 0xc0) != 0x80) / Fourth byte */
				201	{
				202	*erroroffset = (int)(p - string) - 3;
				203	return PCRE_UTF8_ERR8;
				204	}
				205	if (c == 0xf0 && (d & 0x30) == 0)
				206	{
				207	*erroroffset = (int)(p - string) - 3;
				208	return PCRE_UTF8_ERR17;
				209	}
				210	if (c > 0xf4 \|\| (c == 0xf4 && d > 0x8f))
				211	{
				212	*erroroffset = (int)(p - string) - 3;
				213	return PCRE_UTF8_ERR13;
				214	}
				215	break;
				216
				217	/* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be
				218	rejected by the length test below. However, we do the appropriate tests
				219	here so that overlong sequences get diagnosed, and also in case there is
				220	ever an option for handling these larger code points. */
				221
				222	/* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for
				223	1111 1000, xx00 0xxx */
				224
				225	case 4:
				226	if (((++p) & 0xc0) != 0x80) / Third byte */
				227	{
				228	*erroroffset = (int)(p - string) - 2;
				229	return PCRE_UTF8_ERR7;
				230	}
				231	if (((++p) & 0xc0) != 0x80) / Fourth byte */
				232	{
				233	*erroroffset = (int)(p - string) - 3;
				234	return PCRE_UTF8_ERR8;
				235	}
				236	if (((++p) & 0xc0) != 0x80) / Fifth byte */
				237	{
				238	*erroroffset = (int)(p - string) - 4;
				239	return PCRE_UTF8_ERR9;
				240	}
				241	if (c == 0xf8 && (d & 0x38) == 0)
				242	{
				243	*erroroffset = (int)(p - string) - 4;
				244	return PCRE_UTF8_ERR18;
				245	}
				246	break;
				247
				248	/* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for
				249	1111 1100, xx00 00xx. */
				250
				251	case 5:
				252	if (((++p) & 0xc0) != 0x80) / Third byte */
				253	{
				254	*erroroffset = (int)(p - string) - 2;
				255	return PCRE_UTF8_ERR7;
				256	}
				257	if (((++p) & 0xc0) != 0x80) / Fourth byte */
				258	{
				259	*erroroffset = (int)(p - string) - 3;
				260	return PCRE_UTF8_ERR8;
				261	}
				262	if (((++p) & 0xc0) != 0x80) / Fifth byte */
				263	{
				264	*erroroffset = (int)(p - string) - 4;
				265	return PCRE_UTF8_ERR9;
				266	}
				267	if (((++p) & 0xc0) != 0x80) / Sixth byte */
				268	{
				269	*erroroffset = (int)(p - string) - 5;
				270	return PCRE_UTF8_ERR10;
				271	}
				272	if (c == 0xfc && (d & 0x3c) == 0)
				273	{
				274	*erroroffset = (int)(p - string) - 5;
				275	return PCRE_UTF8_ERR19;
				276	}
				277	break;
				278	}
				279
				280	/* Character is valid under RFC 2279, but 4-byte and 5-byte characters are
				281	excluded by RFC 3629. The pointer p is currently at the last byte of the
				282	character. */
				283
				284	if (ab > 3)
				285	{
				286	*erroroffset = (int)(p - string) - ab;
				287	return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
				288	}
				289	}
				290
				291	#else /* SUPPORT_UTF8 */
				292	(void)(string); /* Keep picky compilers happy */
				293	(void)(length);
				294	#endif
				295
				296	return PCRE_UTF8_ERR0; /* This indicates success */
				297	}
				298
				299	/* End of pcre_valid_utf8.c */