src/utf8_utils.cpp - jami-daemon - Gitiles

 /*
  *  Copyright (C) 1999 Tom Tromey
  *  Copyright (C) 2000 Red Hat, Inc.
  *  Copyright (C) 2004-2021 Savoir-faire Linux Inc.
  *
  *  Author: Pascal Potvin <pascal.potvin@extenway.com>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 3 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA.
  */

 #include <cstring>
 #include <cassert>
 #include "utf8_utils.h"

 #if defined(_MSC_VER)
 #include <BaseTsd.h>
 using ssize_t = SSIZE_T;
 #endif

 /*
  * The LIKELY and UNLIKELY macros let the programmer give hints to
  * the compiler about the expected result of an expression. Some compilers
  * can use this information for optimizations.
  */
 #if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
 #define LIKELY(expr)   (__builtin_expect(expr, 1))
 #define UNLIKELY(expr) (__builtin_expect(expr, 0))
 #else
 #define LIKELY(expr)   (expr)
 #define UNLIKELY(expr) (expr)
 #endif

 /*
  * Check whether a Unicode (5.2) char is in a valid range.
  *
  * The first check comes from the Unicode guarantee to never encode
  * a point above 0x0010ffff, since UTF-16 couldn't represent it.
  *
  * The second check covers surrogate pairs (category Cs).
  *
  * @param Char the character
  */
 #define UNICODE_VALID(Char) ((Char) < 0x110000 && (((Char) &0xFFFFF800) != 0xD800))

 /*
  * Consume one UTF-8 continuation byte (10xxxxxx) at the current position.
  *
  * This macro expands in place and relies on names supplied by the
  * function that uses it:
  *   - `p`     pointer to the byte being examined,
  *   - `val`   accumulator for the code point being decoded,
  *   - `error` label that is jumped to when the byte at `p` is not a
  *             continuation byte.
  *
  * On success the low six bits of the byte are shifted into `val`.
  * The macro does not advance `p`; the caller increments it beforehand.
  */
 #define CONTINUATION_CHAR \
     if ((*(unsigned char*) p & 0xc0) != 0x80) /* 10xxxxxx */ \
         goto error; \
     val <<= 6; \
     val |= (*(unsigned char*) p) & 0x3f;

 namespace jami {

 bool utf8_validate_c_str(const char* str, ssize_t max_len, const char** end);

 /*
  * Walk a NUL-terminated string and locate the end of its well-formed
  * UTF-8 prefix.
  *
  * Returns a pointer to the terminating NUL when every sequence is valid;
  * otherwise a pointer to the lead byte of the first sequence that is
  * malformed, overlong, a surrogate, or above U+10FFFF. A truncated
  * sequence is caught because the NUL terminator is never a continuation
  * byte, so the scan never reads past the terminator.
  */
 static const char*
 fast_validate(const char* str)
 {
     const auto octet = [](const char* at) { return static_cast<unsigned char>(*at); };
     const char* cursor = str;

     while (*cursor != '\0') {
         const char* const seq_start = cursor;
         const unsigned char lead = octet(cursor);

         if (lead < 0x80) { /* 0xxxxxxx: plain ASCII */
             ++cursor;
             continue;
         }

         if ((lead & 0xe0) == 0xc0) { /* 110xxxxx */
             /* 0xC0 and 0xC1 can only start overlong encodings */
             if (UNLIKELY((lead & 0x1e) == 0))
                 return seq_start;

             ++cursor;
             if (UNLIKELY((octet(cursor) & 0xc0) != 0x80)) /* 10xxxxxx */
                 return seq_start;

             ++cursor;
             continue;
         }

         int trailing = 0;        /* continuation bytes still expected */
         char32_t smallest = 0;   /* lowest code point allowed for this length */
         char32_t code_point = 0; /* value decoded so far */

         if ((lead & 0xf0) == 0xe0) { /* 1110xxxx */
             trailing = 2;
             smallest = (1 << 11);
             code_point = lead & 0x0f;
         } else if ((lead & 0xf8) == 0xf0) { /* 11110xxx */
             trailing = 3;
             smallest = (1 << 16);
             code_point = lead & 0x07;
         } else {
             /* stray continuation byte or a lead byte of 0xF8 and above */
             return seq_start;
         }

         for (int i = 0; i < trailing; ++i) {
             ++cursor;
             const unsigned char next = octet(cursor);
             if ((next & 0xc0) != 0x80) /* 10xxxxxx */
                 return seq_start;
             code_point = (code_point << 6) | (next & 0x3f);
         }

         /* reject overlong forms */
         if (UNLIKELY(code_point < smallest))
             return seq_start;

         /* reject surrogates and values beyond U+10FFFF */
         if (UNLIKELY(!UNICODE_VALID(code_point)))
             return seq_start;

         ++cursor;
     }

     return cursor;
 }

 /*
  * Length-bounded variant of the UTF-8 scan: examines at most max_len
  * bytes of str and also stops at a NUL byte.
  *
  * Returns str + max_len when all max_len bytes form valid UTF-8 with no
  * NUL among them. Otherwise returns a pointer to the NUL byte that ended
  * the scan, or to the lead byte of the first sequence that is malformed,
  * overlong, a surrogate, above U+10FFFF, or that would extend past
  * max_len.
  *
  * max_len must not be negative.
  */
 static const char*
 fast_validate_len(const char* str, ssize_t max_len)
 {
     assert(max_len >= 0);

     const auto octet = [](const char* at) { return static_cast<unsigned char>(*at); };
     const char* cursor = str;

     for (;;) {
         /* bytes still available, counting the one under the cursor */
         const ssize_t left = max_len - (cursor - str);
         if (left <= 0 || *cursor == '\0')
             break;

         const char* const seq_start = cursor;
         const unsigned char lead = octet(cursor);

         if (lead < 0x80) { /* 0xxxxxxx: plain ASCII */
             ++cursor;
             continue;
         }

         if ((lead & 0xe0) == 0xc0) { /* 110xxxxx */
             if (UNLIKELY(left < 2))
                 return seq_start;

             /* 0xC0 and 0xC1 can only start overlong encodings */
             if (UNLIKELY((lead & 0x1e) == 0))
                 return seq_start;

             ++cursor;
             if (UNLIKELY((octet(cursor) & 0xc0) != 0x80)) /* 10xxxxxx */
                 return seq_start;

             ++cursor;
             continue;
         }

         int trailing = 0;        /* continuation bytes still expected */
         char32_t smallest = 0;   /* lowest code point allowed for this length */
         char32_t code_point = 0; /* value decoded so far */

         if ((lead & 0xf0) == 0xe0) { /* 1110xxxx */
             if (UNLIKELY(left < 3))
                 return seq_start;
             trailing = 2;
             smallest = (1 << 11);
             code_point = lead & 0x0f;
         } else if ((lead & 0xf8) == 0xf0) { /* 11110xxx */
             if (UNLIKELY(left < 4))
                 return seq_start;
             trailing = 3;
             smallest = (1 << 16);
             code_point = lead & 0x07;
         } else {
             /* stray continuation byte or a lead byte of 0xF8 and above */
             return seq_start;
         }

         for (int i = 0; i < trailing; ++i) {
             ++cursor;
             const unsigned char next = octet(cursor);
             if ((next & 0xc0) != 0x80) /* 10xxxxxx */
                 return seq_start;
             code_point = (code_point << 6) | (next & 0x3f);
         }

         /* reject overlong forms */
         if (UNLIKELY(code_point < smallest))
             return seq_start;

         /* reject surrogates and values beyond U+10FFFF */
         if (UNLIKELY(!UNICODE_VALID(code_point)))
             return seq_start;

         ++cursor;
     }

     return cursor;
 }

 /**
  * utf8_validate_c_str:
  * @str: a pointer to character data
  * @max_len: number of bytes to validate, or a negative value to scan
  *           until the terminating NUL
  * @end: optional return location for the end of the valid data
  *
  * Checks that @str holds well-formed UTF-8. With a negative @max_len the
  * text must be NUL-terminated and is scanned up to that terminator;
  * otherwise exactly @max_len bytes are examined.
  *
  * When @end is non-null it receives the position where scanning stopped:
  * the start of the first invalid sequence if one was found, or the end
  * of the validated text otherwise.
  *
  * With a non-negative @max_len, a NUL byte among the @max_len bytes makes
  * the function return false.
  *
  * DBus requires valid UTF-8 as input and SIP packets should be encoded
  * in UTF-8 as well, so data read from a file or the network should be
  * validated before anything else is done with it.
  *
  * Returns: true if the text was valid UTF-8
  */
 bool
 utf8_validate_c_str(const char* str, ssize_t max_len, const char** end)
 {
     const bool bounded = (max_len >= 0);
     const char* const stop = bounded ? fast_validate_len(str, max_len) : fast_validate(str);

     if (end != nullptr)
         *end = stop;

     /* bounded: the scan must have consumed every one of the max_len bytes */
     if (bounded)
         return stop == str + max_len;

     /* unbounded: the scan must have reached the terminating NUL */
     return *stop == '\0';
 }

 bool
 utf8_validate(const std::string& str)
 {
     const char* p;

     p = fast_validate(str.c_str());

     return (*p == '\0');
 }

 /**
  * Build a valid UTF-8 copy of @name.
  *
  * Every byte that cannot be part of a well-formed UTF-8 sequence
  * (including an embedded NUL) is replaced by U+FFFD REPLACEMENT
  * CHARACTER; all valid sequences are copied unchanged. When @name is
  * already valid it is returned as is.
  *
  * The output is accumulated in a std::string, so earlier repaired
  * segments are kept when the input contains several invalid bytes and
  * no manual buffer management is needed.
  *
  * @param name the possibly invalid input
  * @return a valid UTF-8 string
  */
 std::string
 utf8_make_valid(const std::string& name)
 {
     /* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER */
     static const char replacement[] = "\357\277\275";

     ssize_t remaining_bytes = static_cast<ssize_t>(name.size());
     const char* remainder = name.data();
     const char* invalid = nullptr;
     std::string answer;
     bool repaired = false;

     while (remaining_bytes != 0) {
         if (utf8_validate_c_str(remainder, remaining_bytes, &invalid))
             break;

         if (!repaired) {
             /* worst case: every input byte becomes a 3-byte U+FFFD */
             answer.reserve(3 * name.size());
             repaired = true;
         }

         /* keep the valid run, then substitute the offending byte */
         const ssize_t valid_bytes = invalid - remainder;
         answer.append(remainder, static_cast<std::string::size_type>(valid_bytes));
         answer.append(replacement, 3);

         remaining_bytes -= valid_bytes + 1;
         remainder = invalid + 1;
     }

     if (!repaired)
         return name;

     /* whatever is left after the last invalid byte is valid (or empty) */
     answer.append(remainder, static_cast<std::string::size_type>(remaining_bytes));

     assert(utf8_validate_c_str(answer.c_str(), -1, nullptr));

     return answer;
 }

 } // namespace jami
	/*
	* Copyright (C) 1999 Tom Tromey
	* Copyright (C) 2000 Red Hat, Inc.
	* Copyright (C) 2004-2021 Savoir-faire Linux Inc.
	*
	* Author: Pascal Potvin <pascal.potvin@extenway.com>
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 3 of the License, or
	* (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
	*/

	#include <cstring>
	#include <cassert>
	#include "utf8_utils.h"

	#if defined(_MSC_VER)
	#include <BaseTsd.h>
	using ssize_t = SSIZE_T;
	#endif

	/*
	* The LIKELY and UNLIKELY macros let the programmer give hints to
	* the compiler about the expected result of an expression. Some compilers
	* can use this information for optimizations.
	*/
	#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
	#define LIKELY(expr) (__builtin_expect(expr, 1))
	#define UNLIKELY(expr) (__builtin_expect(expr, 0))
	#else
	#define LIKELY(expr) (expr)
	#define UNLIKELY(expr) (expr)
	#endif

	/*
	* Check whether a Unicode (5.2) char is in a valid range.
	*
	* The first check comes from the Unicode guarantee to never encode
	* a point above 0x0010ffff, since UTF-16 couldn't represent it.
	*
	* The second check covers surrogate pairs (category Cs).
	*
	* @param Char the character
	*/
	#define UNICODE_VALID(Char) ((Char) < 0x110000 && (((Char) &0xFFFFF800) != 0xD800))

	/*
	 * Consume one UTF-8 continuation byte (10xxxxxx) at the current position.
	 *
	 * Expands in place and relies on names supplied by the function that
	 * uses it: `p` (pointer to the byte being examined), `val` (code point
	 * accumulator) and an `error` label that is jumped to when the byte at
	 * `p` is not a continuation byte. On success the low six bits of the
	 * byte are shifted into `val`. The macro does not advance `p`.
	 */
	#define CONTINUATION_CHAR \
	    if ((*(const unsigned char*) p & 0xc0) != 0x80) /* 10xxxxxx */ \
	        goto error; \
	    val <<= 6; \
	    val |= (*(const unsigned char*) p) & 0x3f;

	namespace jami {

	bool utf8_validate_c_str(const char* str, ssize_t max_len, const char** end);

	/*
	 * Walk a NUL-terminated string and locate the end of its well-formed
	 * UTF-8 prefix.
	 *
	 * Returns a pointer to the terminating NUL when every sequence is valid;
	 * otherwise a pointer to the lead byte of the first sequence that is
	 * malformed, overlong, a surrogate, or above U+10FFFF. A truncated
	 * sequence is caught because the NUL terminator is never a continuation
	 * byte, so the scan never reads past the terminator.
	 *
	 * Each byte is read through the pointer and converted to unsigned char;
	 * the pointer value itself is never cast to a character.
	 */
	static const char*
	fast_validate(const char* str)
	{
	    const auto octet = [](const char* at) { return static_cast<unsigned char>(*at); };
	    const char* cursor = str;

	    while (*cursor != '\0') {
	        const char* const seq_start = cursor;
	        const unsigned char lead = octet(cursor);

	        if (lead < 0x80) { /* 0xxxxxxx: plain ASCII */
	            ++cursor;
	            continue;
	        }

	        if ((lead & 0xe0) == 0xc0) { /* 110xxxxx */
	            /* 0xC0 and 0xC1 can only start overlong encodings */
	            if ((lead & 0x1e) == 0)
	                return seq_start;

	            ++cursor;
	            if ((octet(cursor) & 0xc0) != 0x80) /* 10xxxxxx */
	                return seq_start;

	            ++cursor;
	            continue;
	        }

	        int trailing = 0;        /* continuation bytes still expected */
	        char32_t smallest = 0;   /* lowest code point allowed for this length */
	        char32_t code_point = 0; /* value decoded so far */

	        if ((lead & 0xf0) == 0xe0) { /* 1110xxxx */
	            trailing = 2;
	            smallest = (1 << 11);
	            code_point = lead & 0x0f;
	        } else if ((lead & 0xf8) == 0xf0) { /* 11110xxx */
	            trailing = 3;
	            smallest = (1 << 16);
	            code_point = lead & 0x07;
	        } else {
	            /* stray continuation byte or a lead byte of 0xF8 and above */
	            return seq_start;
	        }

	        for (int i = 0; i < trailing; ++i) {
	            ++cursor;
	            const unsigned char next = octet(cursor);
	            if ((next & 0xc0) != 0x80) /* 10xxxxxx */
	                return seq_start;
	            code_point = (code_point << 6) | (next & 0x3f);
	        }

	        /* reject overlong forms */
	        if (code_point < smallest)
	            return seq_start;

	        /* reject values beyond U+10FFFF and the surrogate range */
	        if (code_point >= 0x110000 || (code_point & 0xFFFFF800) == 0xD800)
	            return seq_start;

	        ++cursor;
	    }

	    return cursor;
	}

	/*
	 * Length-bounded variant of the UTF-8 scan: examines at most max_len
	 * bytes of str and also stops at a NUL byte.
	 *
	 * Returns str + max_len when all max_len bytes form valid UTF-8 with no
	 * NUL among them. Otherwise returns a pointer to the NUL byte that ended
	 * the scan, or to the lead byte of the first sequence that is malformed,
	 * overlong, a surrogate, above U+10FFFF, or that would extend past
	 * max_len.
	 *
	 * Each byte is read through the pointer and converted to unsigned char;
	 * the pointer value itself is never cast to a character.
	 *
	 * max_len must not be negative.
	 */
	static const char*
	fast_validate_len(const char* str, ssize_t max_len)
	{
	    assert(max_len >= 0);

	    const auto octet = [](const char* at) { return static_cast<unsigned char>(*at); };
	    const char* cursor = str;

	    for (;;) {
	        /* bytes still available, counting the one under the cursor */
	        const ssize_t left = max_len - (cursor - str);
	        if (left <= 0 || *cursor == '\0')
	            break;

	        const char* const seq_start = cursor;
	        const unsigned char lead = octet(cursor);

	        if (lead < 0x80) { /* 0xxxxxxx: plain ASCII */
	            ++cursor;
	            continue;
	        }

	        if ((lead & 0xe0) == 0xc0) { /* 110xxxxx */
	            if (left < 2)
	                return seq_start;

	            /* 0xC0 and 0xC1 can only start overlong encodings */
	            if ((lead & 0x1e) == 0)
	                return seq_start;

	            ++cursor;
	            if ((octet(cursor) & 0xc0) != 0x80) /* 10xxxxxx */
	                return seq_start;

	            ++cursor;
	            continue;
	        }

	        int trailing = 0;        /* continuation bytes still expected */
	        char32_t smallest = 0;   /* lowest code point allowed for this length */
	        char32_t code_point = 0; /* value decoded so far */

	        if ((lead & 0xf0) == 0xe0) { /* 1110xxxx */
	            if (left < 3)
	                return seq_start;
	            trailing = 2;
	            smallest = (1 << 11);
	            code_point = lead & 0x0f;
	        } else if ((lead & 0xf8) == 0xf0) { /* 11110xxx */
	            if (left < 4)
	                return seq_start;
	            trailing = 3;
	            smallest = (1 << 16);
	            code_point = lead & 0x07;
	        } else {
	            /* stray continuation byte or a lead byte of 0xF8 and above */
	            return seq_start;
	        }

	        for (int i = 0; i < trailing; ++i) {
	            ++cursor;
	            const unsigned char next = octet(cursor);
	            if ((next & 0xc0) != 0x80) /* 10xxxxxx */
	                return seq_start;
	            code_point = (code_point << 6) | (next & 0x3f);
	        }

	        /* reject overlong forms */
	        if (code_point < smallest)
	            return seq_start;

	        /* reject values beyond U+10FFFF and the surrogate range */
	        if (code_point >= 0x110000 || (code_point & 0xFFFFF800) == 0xD800)
	            return seq_start;

	        ++cursor;
	    }

	    return cursor;
	}

	/**
	 * utf8_validate_c_str:
	 * @str: a pointer to character data
	 * @max_len: number of bytes to validate, or a negative value to scan
	 *           until the terminating NUL
	 * @end: optional return location for the end of the valid data
	 *
	 * Checks that @str holds well-formed UTF-8. With a negative @max_len the
	 * text must be NUL-terminated and is scanned up to that terminator;
	 * otherwise exactly @max_len bytes are examined.
	 *
	 * When @end is non-null it receives the position where scanning stopped:
	 * the start of the first invalid sequence if one was found, or the end
	 * of the validated text otherwise.
	 *
	 * With a non-negative @max_len, a NUL byte among the @max_len bytes makes
	 * the function return false.
	 *
	 * DBus requires valid UTF-8 as input and SIP packets should be encoded
	 * in UTF-8 as well, so data read from a file or the network should be
	 * validated before anything else is done with it.
	 *
	 * Returns: true if the text was valid UTF-8
	 */
	bool
	utf8_validate_c_str(const char* str, ssize_t max_len, const char** end)
	{
	    const bool bounded = (max_len >= 0);
	    const char* const stop = bounded ? fast_validate_len(str, max_len) : fast_validate(str);

	    if (end != nullptr)
	        *end = stop;

	    /* bounded: the scan must have consumed every one of the max_len bytes */
	    if (bounded)
	        return stop == str + max_len;

	    /* unbounded: the scan must have reached the terminating NUL */
	    return *stop == '\0';
	}

	bool
	utf8_validate(const std::string& str)
	{
	const char* p;

	p = fast_validate(str.c_str());

	return (*p == '\0');
	}

	/**
	 * Build a valid UTF-8 copy of @name.
	 *
	 * Every byte that cannot be part of a well-formed UTF-8 sequence
	 * (including an embedded NUL) is replaced by U+FFFD REPLACEMENT
	 * CHARACTER; all valid sequences are copied unchanged. When @name is
	 * already valid it is returned as is.
	 *
	 * The output is accumulated in a std::string, so earlier repaired
	 * segments are kept when the input contains several invalid bytes and
	 * no manual buffer management is needed.
	 *
	 * @param name the possibly invalid input
	 * @return a valid UTF-8 string
	 */
	std::string
	utf8_make_valid(const std::string& name)
	{
	    /* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER */
	    static const char replacement[] = "\357\277\275";

	    ssize_t remaining_bytes = static_cast<ssize_t>(name.size());
	    const char* remainder = name.data();
	    const char* invalid = nullptr;
	    std::string answer;
	    bool repaired = false;

	    while (remaining_bytes != 0) {
	        if (utf8_validate_c_str(remainder, remaining_bytes, &invalid))
	            break;

	        if (!repaired) {
	            /* worst case: every input byte becomes a 3-byte U+FFFD */
	            answer.reserve(3 * name.size());
	            repaired = true;
	        }

	        /* keep the valid run, then substitute the offending byte */
	        const ssize_t valid_bytes = invalid - remainder;
	        answer.append(remainder, static_cast<std::string::size_type>(valid_bytes));
	        answer.append(replacement, 3);

	        remaining_bytes -= valid_bytes + 1;
	        remainder = invalid + 1;
	    }

	    if (!repaired)
	        return name;

	    /* whatever is left after the last invalid byte is valid (or empty) */
	    answer.append(remainder, static_cast<std::string::size_type>(remaining_bytes));

	    assert(utf8_validate_c_str(answer.c_str(), -1, nullptr));

	    return answer;
	}

	} // namespace jami