Simplified the horrible mess the UTF-8 encoding function was

This commit is contained in:
Phuntsok Drak-pa 2018-04-03 16:11:54 +02:00
parent 5c2f6c62ec
commit 3ce2754211

View File

@ -128,54 +128,37 @@ const uvec compress(const ustring &text, dic_t &dictionary) {
* \param out Fichier de sortie * \param out Fichier de sortie
* \param c Caractères à écrire dans \p out * \param c Caractères à écrire dans \p out
*/ */
void write_char(FILE *out, uint32_t c) { void write_utf8(FILE* out, uint32_t c) {
if(c < 128) { if(c < 128) {
fwrite(&c, sizeof(unsigned char), 1, out); fwrite(&c, sizeof(unsigned char), 1, out);
} else if (c < 2048) { return;
auto temp = new unsigned char[2];
temp[0] = static_cast<unsigned char>((c >> 6) + 0xC0); // 110xxxxx
temp[1] = static_cast<unsigned char>((c & 0x3F) + 0x80); // 10xxxxxx
fwrite(temp, sizeof(unsigned char), 2, out);
delete[] temp;
} else if (c < 65536) {
auto temp = new unsigned char[3];
temp[0] = static_cast<unsigned char>((c >> 12) + 0xE0); // 1110xxxx
temp[1] = static_cast<unsigned char>(((c >> 6) & 0x3F) + 0x80); // 10xxxxxx
temp[2] = static_cast<unsigned char>((c & 0x3F) + 0x80); // 10xxxxxx
fwrite(temp, sizeof(unsigned char), 3, out);
delete[] temp;
} else if (c < 2097152) {
auto temp = new unsigned char[4];
temp[0] = static_cast<unsigned char>((c >> 18) + 0xF0); // 11110xxx
temp[1] = static_cast<unsigned char>(((c >> 12) & 0x3F) + 0x80); // 10xxxxxx
temp[2] = static_cast<unsigned char>(((c >> 6) & 0x3F) + 0x80); // 10xxxxxx
temp[3] = static_cast<unsigned char>((c & 0x3F) + 0x80); // 10xxxxxx
fwrite(temp, sizeof(unsigned char), 4, out);
delete[] temp;
} else if (c < 67108864) {
auto temp = new unsigned char[5];
temp[0] = static_cast<unsigned char>((c >> 24) + 0xF8); // 111110xx
temp[1] = static_cast<unsigned char>(((c >> 18) & 0x3F) + 0x80); // 10xxxxxx
temp[2] = static_cast<unsigned char>(((c >> 12) & 0x3F) + 0x80); // 10xxxxxx
temp[3] = static_cast<unsigned char>(((c >> 6) & 0x3F) + 0x80); // 10xxxxxx
temp[4] = static_cast<unsigned char>((c & 0x3F) + 0x80); // 10xxxxxx
fwrite(temp, sizeof(unsigned char), 5, out);
delete[] temp;
} else if (c < 2147483648) {
auto temp = new unsigned char[6];
temp[0] = static_cast<unsigned char>((c >> 30) + 0xFC); // 1111110x
temp[1] = static_cast<unsigned char>(((c >> 24) & 0x3F) + 0x80); // 10xxxxxx
temp[2] = static_cast<unsigned char>(((c >> 18) & 0x3F) + 0x80); // 10xxxxxx
temp[3] = static_cast<unsigned char>(((c >> 12) & 0x3F) + 0x80); // 10xxxxxx
temp[4] = static_cast<unsigned char>(((c >> 6) & 0x3F) + 0x80); // 10xxxxxx
temp[5] = static_cast<unsigned char>((c & 0x3F) + 0x80); // 10xxxxxx
fwrite(temp, sizeof(unsigned char), 6, out);
delete[] temp;
} else {
// erreur, nombre trop grand
perror("Character value too high, must fit in 31bits");
exit(1);
} }
size_t loops = 0;
unsigned char header = 0;
if (c < 2048) {
loops = 1;
header = 0xC0;
} else if (c < 65536) {
loops = 2;
header = 0xE0;
} else if (c < 2097152) {
loops = 3;
header = 0xF0;
} else if (c < 67108864) {
loops = 4;
header = 0xF8;
} else {
loops = 5;
header = 0xFC;
}
ustring str(loops + 1, 0);
for (size_t i = 0; i <= loops; ++i) {
str[i] = static_cast<unsigned char>(
((c & (i == loops) ? 0x3F : 0xFF) >> ((loops - i) * 6)) +
((i == 0) ? header : 0x80));
}
fwrite(str.data(), sizeof(unsigned char), str.size(), out);
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
@ -230,7 +213,7 @@ int main(int argc, char *argv[]) {
printf("Number of custom words in the dictionary: %zu\n", dictionary.size()); printf("Number of custom words in the dictionary: %zu\n", dictionary.size());
for(const auto c : comp_str) for(const auto c : comp_str)
write_char(out, c); write_utf8(out, c);
fclose(out); fclose(out);
t.close(); t.close();