Commit ccebe901 authored by louiz’'s avatar louiz’

Check UTF-8 encoding, and convert strings to UTF-8

Handle conversion errors properly by inserting � in place of the bytes that
cannot be converted.  Add a binary header to provide a portable way to write
binary literals (I like them).  Also add a test file.  ref #2404
parent a418b6ed
......@@ -19,6 +19,13 @@ include_directories("src/")
# coming from these headers.
include_directories(SYSTEM ${CRYPTO++_INCLUDE_DIR})
#
## utils
#
file(GLOB source_utils
src/utils/*.[hc]pp)
add_library(utils STATIC ${source_utils})
#
## network
#
......@@ -32,7 +39,7 @@ add_library(network STATIC ${source_network})
file(GLOB source_irc
src/irc/*.[hc]pp)
add_library(irc STATIC ${source_irc})
target_link_libraries(irc network)
target_link_libraries(irc network utils)
#
## xmpplib
......@@ -40,7 +47,7 @@ target_link_libraries(irc network)
file(GLOB source_xmpp
src/xmpp/*.[hc]pp)
add_library(xmpp STATIC ${source_xmpp})
target_link_libraries(xmpp bridge network ${CRYPTO++_LIBRARIES} expatpp)
target_link_libraries(xmpp bridge network utils ${CRYPTO++_LIBRARIES} expatpp)
#
## bridge
......@@ -54,4 +61,15 @@ add_executable(${PROJECT_NAME} src/main.cpp)
target_link_libraries(${PROJECT_NAME}
xmpp
irc
bridge)
\ No newline at end of file
bridge)
#
## Tests
#
# Builds the hand-written test program from src/test.cpp and links it against
# the project's libraries (xmpp, irc, bridge and utils).
# NOTE(review): `test` is a target name that CMake itself may reserve (see
# policy CMP0037, notably once enable_testing() is used) -- consider a
# different target name; TODO confirm against the CMake version in use.
add_executable(test src/test.cpp)
target_link_libraries(test
xmpp
irc
bridge
utils)
#include <irc/irc_client.hpp>
#include <xmpp/xmpp_component.hpp>
#include <network/poller.hpp>
#include <xmpp/xmpp_parser.hpp>
#include <xmpp/xmpp_stanza.hpp>
#include <xmpp/xmpp_component.hpp>
#include <memory>
#include <xmpp/jid.hpp>
#include <irc/iid.hpp>
#include <iostream>
// Program entry point: registers socket handlers on a Poller and then runs
// the poll loop.
// NOTE(review): this listing looks like a rendered diff whose +/- markers were
// lost, so removed and added statements appear side by side -- verify against
// the real src/main.cpp before relying on the exact statement sequence.
int main()
{
Poller p;
// Now I'm the bridge, creating an ircclient because needed.
std::shared_ptr<IrcClient> c = std::make_shared<IrcClient>();
p.add_socket_handler(c);
std::shared_ptr<IrcClient> d = std::make_shared<IrcClient>();
p.add_socket_handler(d);
std::shared_ptr<IrcClient> e = std::make_shared<IrcClient>();
p.add_socket_handler(e);
// Each client is pointed at a different port on localhost.
c->connect("localhost", "7877");
d->connect("localhost", "7878");
e->connect("localhost", "7879");
// NOTE(review): this loop has no exit condition, so as written every
// statement below it is unreachable.
while (true)
p.poll();
// Create the XMPP component, register it on the same poller and start it.
std::shared_ptr<XmppComponent> xmpp_component =
std::make_shared<XmppComponent>("irc.localhost", "secret");
p.add_socket_handler(xmpp_component);
xmpp_component->start();
// Keep polling for as long as poll() returns a value that converts to true.
while (p.poll())
;
return 0;
}
/**
* Just a very simple test suite, by hand, using assert()
*/
#include <assert.h>
#include <iostream>
#include <utils/encoding.hpp>
#include <string.h>
#include <fstream>
int main()
{
  /**
   * Encoding
   */

  // Well-formed UTF-8 made of base letters buried under combining marks.
  const char* const decorated = "C̡͔͕̩͙̽ͫ̈́ͥ̿̆ͧ̚r̸̩̘͍̻͖̆͆͛͊̉̕͡o͇͈̳̤̱̊̈͢q̻͍̦̮͕ͥͬͬ̽ͭ͌̾ͅǔ͉͕͇͚̙͉̭͉̇̽ȇ͈̮̼͍͔ͣ͊͞͝ͅ ͫ̾ͪ̓ͥ̆̋̔҉̢̦̠͈͔̖̲̯̦ụ̶̯͐̃̋ͮ͆͝n̬̱̭͇̻̱̰̖̤̏͛̏̿̑͟ë́͐҉̸̥̪͕̹̻̙͉̰ ̹̼̱̦̥ͩ͑̈́͑͝ͅt͍̥͈̹̝ͣ̃̔̈̔ͧ̕͝ḙ̸̖̟̙͙ͪ͢ų̯̞̼̲͓̻̞͛̃̀́b̮̰̗̩̰̊̆͗̾̎̆ͯ͌͝.̗̙͎̦ͫ̈́ͥ͌̈̓ͬ";
  assert(utils::is_valid_utf8(decorated));

  // A 4-byte lead byte followed by something that is not a continuation byte.
  const char* const broken_sequence = "\xF0\x0F";
  assert(!utils::is_valid_utf8(broken_sequence));

  // Bytes that match none of the UTF-8 lead-byte patterns.
  const char* const bad_lead_bytes = "\xFE\xFE\xFF\xFF";
  assert(!utils::is_valid_utf8(bad_lead_bytes));

  // Converting UTF-8 to UTF-8 must hand back the very same valid string.
  const std::string greeting("coucou les copains ♥ ");
  assert(utils::is_valid_utf8(greeting.c_str()));
  const std::string roundtrip = utils::convert_to_utf8(greeting, "UTF-8");
  assert(utils::is_valid_utf8(roundtrip.c_str()) && roundtrip == greeting);

  // The same text, once as UTF-8 and once as ISO-8859-1 bytes.
  const std::string expected_utf8("couc¥ou");
  const std::string latin1_bytes("couc\xa5ou");

  // When converting back to utf-8
  const std::string decoded_latin1 =
    utils::convert_to_utf8(latin1_bytes.c_str(), "ISO-8859-1");
  assert(decoded_latin1 == expected_utf8);

  // Check the behaviour when the decoding fails (here because we provide a
  // wrong charset)
  const std::string decoded_ascii =
    utils::convert_to_utf8(latin1_bytes, "US-ASCII");
  assert(decoded_ascii == "couc�ou");

  return 0;
}
#ifndef BINARY_INCLUDED
# define BINARY_INCLUDED
template<char FIRST, char... REST> struct binary
{
static_assert(FIRST == '0' || FIRST == '1', "invalid binary digit" );
enum { value = ((FIRST - '0') << sizeof...(REST)) + binary<REST...>::value };
};
template<> struct binary<'0'> { enum { value = 0 }; };
template<> struct binary<'1'> { enum { value = 1 }; };
template<char... LITERAL> inline
constexpr unsigned int operator "" _b() { return binary<LITERAL...>::value; }
#endif // BINARY_INCLUDED
#include <utils/encoding.hpp>
#include <utils/binary.hpp>
#include <utils/scopeguard.hpp>

#include <assert.h>
#include <errno.h>
#include <string.h>
#include <iconv.h>

#include <stdexcept>
#include <string>
/**
 * The UTF-8-encoded character used as a place holder when a character
 * conversion fails.
 * This is U+FFFD � "replacement character"
 *
 * Stored as an immutable array so that nothing can re-point it, and so that
 * its length is computed from the literal instead of being kept in sync by
 * hand.
 */
static const char invalid_char[] = "\xef\xbf\xbd";
// Number of bytes in the place holder, not counting the terminating NUL.
static const size_t invalid_char_len = sizeof(invalid_char) - 1;
namespace utils
{
/**
 * Returns true if the given null-terminated string is well-formed UTF-8.
 *
 * Based on http://en.wikipedia.org/wiki/UTF-8#Description
 *
 * A null pointer is reported as invalid; the empty string is valid.
 *
 * On top of the lead/continuation byte patterns, the decoded code point of
 * every multi-byte sequence is checked, so the following are rejected too:
 *  - overlong encodings (a code point stored on more bytes than needed,
 *    e.g. C0 80),
 *  - UTF-16 surrogates (U+D800 to U+DFFF),
 *  - anything above U+10FFFF.
 */
bool is_valid_utf8(const char* s)
{
  if (!s)
    return false;

  const unsigned char* str = reinterpret_cast<const unsigned char*>(s);
  while (*str)
    {
      const unsigned char lead = str[0];

      // 1 byte: 0xxxxxxx
      if ((lead & 0x80) == 0x00)
        {
          ++str;
          continue;
        }

      unsigned int trailing;    // continuation bytes expected after the lead
      unsigned long codepoint;  // payload bits gathered so far
      unsigned long smallest;   // lowest code point allowed on that length

      // 2 bytes: 110xxxxx 10xxxxxx
      if ((lead & 0xE0) == 0xC0)
        {
          trailing = 1;
          codepoint = lead & 0x1F;
          smallest = 0x80;
        }
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      else if ((lead & 0xF0) == 0xE0)
        {
          trailing = 2;
          codepoint = lead & 0x0F;
          smallest = 0x800;
        }
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      else if ((lead & 0xF8) == 0xF0)
        {
          trailing = 3;
          codepoint = lead & 0x07;
          smallest = 0x10000;
        }
      // A lone continuation byte, or a byte matching no UTF-8 pattern
      else
        return false;

      for (unsigned int i = 1; i <= trailing; ++i)
        {
          // The terminating NUL is not a continuation byte either, so we
          // return here before ever reading past the end of the string.
          if ((str[i] & 0xC0) != 0x80)
            return false;
          codepoint = (codepoint << 6) | (str[i] & 0x3F);
        }

      if (codepoint < smallest                            // overlong
          || codepoint > 0x10FFFF                         // out of range
          || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) // surrogate
        return false;

      str += trailing + 1;
    }
  return true;
}
/**
 * Convert str, whose bytes are encoded in charset, into UTF-8 using iconv.
 *
 * Input that cannot be decoded does not abort the conversion: each offending
 * byte is replaced by the place holder (invalid_char) and the conversion
 * carries on with the next byte. An unterminated multibyte sequence at the
 * very end of the input yields one final place holder.
 *
 * The whole converted output is returned, including any NUL byte it may
 * contain.
 *
 * Throws std::runtime_error if iconv_open() refuses the charset -> UTF-8
 * conversion.
 */
std::string convert_to_utf8(const std::string& str, const char* charset)
{
  const iconv_t cd = iconv_open("UTF-8", charset);
  if (cd == (iconv_t)-1)
    throw std::runtime_error("Cannot convert into UTF-8");

  // Make sure cd is always closed when we leave this function
  ScopeGuard sg([&]{ iconv_close(cd); });

  // iconv will not attempt to modify this buffer, but it still requires
  // a char**.
  size_t inbytesleft = str.size();
  char* inbuf_ptr = const_cast<char*>(str.c_str());

  // Output area of four bytes per input byte. A std::string owns it, so it is
  // released on every exit path and is valid even when the input is empty.
  std::string outbuf(str.size() * 4, '\0');
  size_t outbytesleft = outbuf.size();
  char* outbuf_ptr = &outbuf[0];

  // Write the place holder at the current output position. The remaining
  // room is updated as well, otherwise iconv would believe it still has more
  // space than it really does. Returns false when it does not fit.
  const auto put_placeholder = [&]() -> bool
    {
      if (outbytesleft < invalid_char_len)
        return false;
      memcpy(outbuf_ptr, invalid_char, invalid_char_len);
      outbuf_ptr += invalid_char_len;
      outbytesleft -= invalid_char_len;
      return true;
    };

  bool done = false;
  while (!done)
    {
      const size_t status =
        iconv(cd, &inbuf_ptr, &inbytesleft, &outbuf_ptr, &outbytesleft);
      if (status != static_cast<size_t>(-1))
        {
          // The conversion finished without any error, stop converting
          done = true;
          continue;
        }

      switch (errno)
        {
        case EILSEQ:
          // Invalid byte found. Insert a placeholder instead of the
          // converted character, jump one byte and continue
          if (put_placeholder())
            {
              --inbytesleft;
              ++inbuf_ptr;
            }
          else
            done = true;
          break;
        case EINVAL:
          // A multibyte sequence is not terminated, but we can't
          // provide any more data, so we just add a placeholder to
          // indicate that the character is not properly converted,
          // and we stop the conversion
          put_placeholder();
          done = true;
          break;
        case E2BIG:
          // Output area exhausted. This should never happen; keep what has
          // been converted so far.
        default:
          // This should happen even neverer
          done = true;
          break;
        }
    }

  // Keep exactly the bytes that were produced
  outbuf.resize(static_cast<size_t>(outbuf_ptr - &outbuf[0]));
  return outbuf;
}
}
#ifndef ENCODING_INCLUDED
# define ENCODING_INCLUDED
#include <string>
namespace utils
{
/**
 * Returns true if the given null-terminated string is valid utf-8.
 * A null pointer is never valid: false is returned for it.
 *
 * Based on http://en.wikipedia.org/wiki/UTF-8#Description
 */
bool is_valid_utf8(const char* s);
/**
 * Convert the given string (encoded in "encoding") into valid utf-8.
 * If some decoding fails, insert an utf-8 placeholder character (U+FFFD,
 * the "replacement character") instead.
 *
 * "encoding" is the charset name given to iconv_open() as the source of the
 * conversion. If iconv_open() fails, a std::runtime_error is thrown.
 */
std::string convert_to_utf8(const std::string& str, const char* encoding);
}
#endif // ENCODING_INCLUDED
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment