Simplified the horrible mess the UTF-8 encoding function was
This commit is contained in:
parent
5c2f6c62ec
commit
3ce2754211
77
src/main.cc
77
src/main.cc
@ -128,54 +128,37 @@ const uvec compress(const ustring &text, dic_t &dictionary) {
|
|||||||
* \param out Fichier de sortie
|
* \param out Fichier de sortie
|
||||||
* \param c Caractères à écrire dans \p out
|
* \param c Caractères à écrire dans \p out
|
||||||
*/
|
*/
|
||||||
// Encodes the code point `c` as a UTF-8 byte sequence and writes it to `out`.
//
// The extended (RFC 2279 style) scheme is used, so any value that fits in
// 31 bits is accepted and produces between 1 and 6 bytes:
//
//   bits  | lead byte | continuation bytes
//   ------+-----------+-------------------
//    <= 7 | 0xxxxxxx  | 0
//   <= 11 | 110xxxxx  | 1
//   <= 16 | 1110xxxx  | 2
//   <= 21 | 11110xxx  | 3
//   <= 26 | 111110xx  | 4
//   <= 31 | 1111110x  | 5
//
// out: destination stream, opened for (binary) writing.
// c:   value to encode; must be < 2^31. Larger values cannot be represented,
//      so an error message is printed on stderr and the program exits with
//      status 1.
void write_utf8(FILE* out, uint32_t c) {
  if (c > 0x7FFFFFFFu) {
    // Not an errno-related failure, hence fputs() instead of perror().
    fputs("Character value too high, must fit in 31bits\n", stderr);
    exit(1);
  }

  if (c < 0x80u) {
    // Narrow first: writing the first in-memory byte of the uint32_t would
    // only yield the low byte on little-endian hosts.
    const unsigned char ascii = static_cast<unsigned char>(c);
    fwrite(&ascii, sizeof(unsigned char), 1, out);
    return;
  }

  // Number of continuation bytes and the marker bits of the lead byte.
  size_t loops = 0;
  unsigned char header = 0;
  if (c < 0x800u) {
    loops = 1;
    header = 0xC0;
  } else if (c < 0x10000u) {
    loops = 2;
    header = 0xE0;
  } else if (c < 0x200000u) {
    loops = 3;
    header = 0xF0;
  } else if (c < 0x4000000u) {
    loops = 4;
    header = 0xF8;
  } else {
    loops = 5;
    header = 0xFC;
  }

  // At most 6 bytes are ever produced, so a stack buffer is enough.
  unsigned char bytes[6];

  // Lead byte: marker bits plus the most significant payload bits. The range
  // checks above guarantee the payload fits in the free bits of the header.
  bytes[0] = static_cast<unsigned char>(header | (c >> (loops * 6)));

  // Continuation bytes: 10xxxxxx, six payload bits each, most significant
  // group first.
  for (size_t i = 1; i <= loops; ++i) {
    bytes[i] = static_cast<unsigned char>(
        0x80u | ((c >> ((loops - i) * 6)) & 0x3Fu));
  }

  fwrite(bytes, sizeof(unsigned char), loops + 1, out);
}

// Former name of write_utf8(), kept so existing callers keep working.
void write_char(FILE* out, uint32_t c) {
  write_utf8(out, c);
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
@ -230,7 +213,7 @@ int main(int argc, char *argv[]) {
|
|||||||
printf("Number of custom words in the dictionary: %zu\n", dictionary.size());
|
printf("Number of custom words in the dictionary: %zu\n", dictionary.size());
|
||||||
|
|
||||||
for(const auto c : comp_str)
|
for(const auto c : comp_str)
|
||||||
write_char(out, c);
|
write_utf8(out, c);
|
||||||
|
|
||||||
fclose(out);
|
fclose(out);
|
||||||
t.close();
|
t.close();
|
||||||
|
Loading…
Reference in New Issue
Block a user