Commit 3960e4d5 authored by louiz’'s avatar louiz’

Functions to provide xml-valid strings

By removing invalid chars, see http://www.w3.org/TR/xml/#charsets
parent b29290f7
......@@ -52,6 +52,11 @@ int main()
assert(from_ascii == "couc�ou");
std::cout << from_ascii << std::endl;
std::string without_ctrl_char("𤭢€¢$");
assert(utils::remove_invalid_xml_chars(without_ctrl_char) == without_ctrl_char);
assert(utils::remove_invalid_xml_chars(in) == in);
assert(utils::remove_invalid_xml_chars("\acouco\u0008u\uFFFEt\uFFFFe\r\n♥") == "coucoute\r\n♥");
/**
* Utils
*/
......@@ -156,7 +161,7 @@ int main()
/**
* Config
*/
std::cout << color << "Testing JID parsing…" << reset << std::endl;
std::cout << color << "Testing config…" << reset << std::endl;
Config::filename = "test.cfg";
Config::file_must_exist = false;
Config::set("coucou", "bonjour");
......
......@@ -9,6 +9,8 @@
#include <config.h>
#include <bitset>
/**
* The UTF-8-encoded character used as a place holder when a character conversion fails.
* This is U+FFFD � "replacement character"
......@@ -66,6 +68,77 @@ namespace utils
return true;
}
/**
 * Return a copy of `original` from which every character that is not
 * allowed in an XML 1.0 document has been removed.
 *
 * Allowed code points (see http://www.w3.org/TR/xml/#charsets):
 *   #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 *
 * The given string MUST be a valid utf-8 string. The whole string is
 * processed (size-based, not NUL-terminated), so an embedded NUL byte is
 * treated like any other forbidden control character and dropped.
 *
 * Throws std::runtime_error if a byte that cannot start a UTF-8 sequence
 * is found where a lead byte is expected, or if the last sequence is
 * truncated.
 */
std::string remove_invalid_xml_chars(const std::string& original)
{
  typedef std::string::size_type size_type;
  static const char* const invalid_utf8 =
      "Invalid UTF-8 passed to remove_invalid_xml_chars";

  const size_type size = original.size();
  // The result can never be longer than the input.
  std::string res;
  res.reserve(size);

  size_type pos = 0;
  // Payload (low 6 bits) of the continuation byte at pos + offset.
  auto continuation = [&original, &pos](const size_type offset) -> unsigned long
    {
      return static_cast<unsigned char>(original[pos + offset]) & 0x3Fu;
    };

  while (pos < size)
    {
      const unsigned char lead = static_cast<unsigned char>(original[pos]);

      // Deduce the sequence length from the lead byte.
      size_type length = 0;
      if ((lead & 0xF8u) == 0xF0u)      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        length = 4;
      else if ((lead & 0xF0u) == 0xE0u) // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        length = 3;
      else if ((lead & 0xE0u) == 0xC0u) // 2 bytes: 110xxxxx 10xxxxxx
        length = 2;
      else if ((lead & 0x80u) == 0)     // 1 byte:  0xxxxxxx
        length = 1;
      else
        throw std::runtime_error(invalid_utf8);

      // Never read past the end of the input on a truncated sequence.
      if (length > size - pos)
        throw std::runtime_error(invalid_utf8);

      bool keep = false;
      if (length == 4)
        {
          // A 4-byte sequence carries 21 bits of payload; an unsigned long
          // (at least 32 bits) holds it without truncation.
          const unsigned long codepoint = ((lead & 0x07ul) << 18) |
                                          (continuation(1) << 12) |
                                          (continuation(2) << 6) |
                                          continuation(3);
          keep = (codepoint <= 0x10FFFFul);
        }
      else if (length == 3)
        {
          const unsigned long codepoint = ((lead & 0x0Ful) << 12) |
                                          (continuation(1) << 6) |
                                          continuation(2);
          // Excludes the surrogate range and the U+FFFE / U+FFFF non-characters.
          keep = (codepoint <= 0xD7FFul ||
                  (codepoint >= 0xE000ul && codepoint <= 0xFFFDul));
        }
      else if (length == 2)
        {
          // All 2 bytes char are valid, don't even bother calculating
          // the codepoint
          keep = true;
        }
      else
        {
          // Only tab, line feed, carriage return and everything from
          // space upwards are allowed among the 1-byte characters.
          keep = (lead == 0x09 || lead == 0x0A || lead == 0x0D || lead >= 0x20);
        }

      if (keep)
        res.append(original, pos, length);
      pos += length;
    }
  return res;
}
std::string convert_to_utf8(const std::string& str, const char* charset)
{
std::string res;
......
......@@ -11,6 +11,14 @@ namespace utils
* Based on http://en.wikipedia.org/wiki/UTF-8#Description
*/
bool is_valid_utf8(const char* s);
/**
* Remove all invalid codepoints from the given utf-8-encoded string.
* The value returned is a copy of the string, without the removed chars.
*
* The given string must already be valid utf-8. A std::runtime_error is
* thrown when a byte that matches none of the 1- to 4-byte utf-8 lead
* byte patterns is found where a character is expected to start.
*
* See http://www.w3.org/TR/xml/#charsets for the list of valid characters
* in XML.
*/
std::string remove_invalid_xml_chars(const std::string& original);
/**
* Convert the given string (encoded in "encoding") into valid utf-8.
* If some decoding fails, insert a utf-8 placeholder character instead.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment