diff --git a/src/compress.cc b/src/compress.cc index a477beb..9f36759 100644 --- a/src/compress.cc +++ b/src/compress.cc @@ -4,39 +4,39 @@ */ #include "compress.hh" -#include "utf8.hh" #include #include #include #include #include "io.hh" +using std::vector; +using std::uint8_t; +using std::uint32_t; +using std::string; +using vuint32 = vector; +using vvuint32 = vector; +using std::string; using dict_t = std::map, uint32_t>; using ustring = std::basic_string; // chaîne non encodée using uvec = std::vector; // chaîne encodée using std::printf; -constexpr size_t CHUNK_SIZE = 32768; +// constexpr size_t CHUNK_SIZE = 32768; -/** - * - * Reçoit une liste de paires std::thread/vecteurs, le premier étant le - * processus dont sa sortie est stockée dans le second. La sortie, une liste - * de caractères uint32_t, est écrite dans le fichier de sortie \p out. - * - * \param[in] t_threads - * \param[out] t_out - */ -void join_and_write( - std::vector, uvec>> &t_threads, - std::vector> &compressed_text) { - for (auto &elem : t_threads) { - (*elem.first).join(); +constexpr int ipow(int base, int exp) { + int result = 1; + for (;;) { + if (exp & 1) { + result *= base; + } + exp >>= 1; + if (exp == 0) { + break; + } + base *= base; } - for (auto &elem : t_threads) { - compressed_text.push_back(std::move(elem.second)); - } - t_threads.clear(); + return result; } /** @@ -49,26 +49,36 @@ void join_and_write( * \param[in] t_text Chaîne de caractères uint8_t représentant le fichier d'entrée * \param[out] t_res Chaîne de caractères de sortie */ -void lzw_compress(const std::vector &t_text, uvec &t_res) { - dict_t dictionary{}; +vvuint32 lzw_compress(string &&t_text) { std::puts("Compressing..."); uint32_t w = 0xFFFF; + vuint32 chunk{}; + vvuint32 res{}; + dict_t dict{}; - constexpr size_t DICT_MAX = 7936; /* 12 bits */ + constexpr size_t DICT_MAX = ipow(2, 13) - 256; /* 12 bits */ - for (const auto &c : t_text) { - if (dictionary.size() >= DICT_MAX) { - t_res.push_back(static_cast(w)); - w = static_cast(c); - } else if (const auto &[exists, pos] = - dico(dictionary, w, static_cast(c)); - exists) { + for(const auto c : t_text) { + if(dict.size() >= DICT_MAX) { + // Dictionary full -> chunk pushed, dict emptied + res.push_back(std::move(chunk)); + chunk = vuint32{}; + dict = dict_t{}; + w = 0xFFFF; + } + if (const auto &[yes, pos] = dico(dict, w, static_cast(c)); yes) { w = pos; } else { - t_res.push_back(static_cast(w)); - w = static_cast(c); + chunk.push_back(static_cast(w)); + w = static_cast(c); } } + if(w != 0xFFFF) { + chunk.push_back(w); + res.push_back(std::move(chunk)); + } + + return res; } /** @@ -91,8 +101,10 @@ void compress(const std::string &t_in_file, const char *t_out_file) { } // Fichier de sortie - FILE *out = + FILE *const out = (t_out_file != nullptr) ? fopen(t_out_file, "wb") : fopen("output.lzw", "wb"); + // std::ofstream out{(t_out_file != nullptr) ? t_out_file : "output.lzw", + // std::ios::binary}; if (out == nullptr) { std::cerr << "Error at " << __FILE__ << ":" << __LINE__ - 4 << ": could not open output file. Aborting...\n"; @@ -100,47 +112,9 @@ void compress(const std::string &t_in_file, const char *t_out_file) { exit(1); } - // collection of chunks - std::vector> compressed_text{}; - - // thread pool - std::vector, uvec>> threads{}; - - // chunk chars - std::vector chunk(CHUNK_SIZE, 0); - while (input_file.read(chunk.data(), - static_cast(chunk.size()))) { - threads.emplace_back(nullptr, uvec{}); - threads.back().second.reserve(CHUNK_SIZE); - threads.back().first = std::make_unique( - std::thread{lzw_compress, chunk, ref(threads.back().second)}); - assert(threads.back().first); - if (threads.size() >= 8) { - join_and_write(threads, compressed_text); - } - } - - if (!threads.empty()) { - join_and_write(threads, compressed_text); - } - - if (input_file.tellg() != std::ios::end) { - std::puts("Leftovers, compressing..."); - { - const auto prev_pos = input_file.tellg(); - input_file.seekg(0, std::ios::end); - chunk.reserve(static_cast(input_file.tellg() - prev_pos)); - input_file.seekg(prev_pos, std::ios::beg); - std::istreambuf_iterator itr(input_file); - for (std::streamoff i = 0; i < prev_pos; ++i, ++itr){ - ; - } - chunk.assign((itr), std::istreambuf_iterator()); - } - uvec ret{}; - lzw_compress(chunk, ret); - compressed_text.push_back(std::move(ret)); - } + const auto compressed_text{ + lzw_compress(std::string{std::istreambuf_iterator(input_file), + std::istreambuf_iterator()})}; write_file(out, compressed_text); diff --git a/src/compress.hh b/src/compress.hh index 39aaf71..ff1cb26 100644 --- a/src/compress.hh +++ b/src/compress.hh @@ -10,14 +10,14 @@ #include #include #include - -/// \brief Exécution des threads et écriture de leur résultat dans le fichier de sortie -void join_and_write(std::vector, - std::vector>> &, - std::vector> &); +#include /// \brief Compression d'une chaine de caractères -void lzw_compress(const std::vector &, std::vector &); +std::vector> lzw_compress(std::string &&); + +std::optional +lzw_compress_char(std::vector> &, + std::vector &, const char); /// \brief Wrapper de \ref lzw_compress void compress(const std::string &, const char *); diff --git a/src/io.cc b/src/io.cc index 57ba9ba..1c077d3 100644 --- a/src/io.cc +++ b/src/io.cc @@ -4,13 +4,24 @@ */ #include "io.hh" +#include #ifdef Debug constexpr bool debug_mode = true; +#include #else constexpr bool debug_mode = false; #endif +using std::vector; +using std::uint32_t; +using vuint32 = vector; +using vvuint32 = vector; + + + +constexpr unsigned char char_size = 12; + /** * Écrit dans le fichier \p t_out les chunks passés en paramètre. Le fichier de * sortie est composé des éléments suivants :\n @@ -35,61 +46,42 @@ constexpr bool debug_mode = false; * \param[out] t_out Fichier de sortie * \param[in] t_text Collection ordonnée des chunks à écrire dans \p t_out */ -void write_file(FILE *t_out, std::vector> &t_text) { - { - uint32_t char_size = 12; - if constexpr (debug_mode) { - std::printf("Char size: %u\n", char_size); - } - fwrite(&char_size, sizeof(uint32_t), 1, t_out); - auto size = static_cast(t_text.size()); - if constexpr (debug_mode) { - std::printf("Number of chunks: %u\n", size); - } - fwrite(&size, sizeof(uint32_t), 1, t_out); +void write_file(FILE *const t_out, const vvuint32 &t_text) { + const auto size = static_cast(t_text.size()); + if constexpr (debug_mode) { + std::printf("Char size: %u\n", char_size); + std::printf("Number of chunks: %u\n", size); } - for(const auto &chunk : t_text) { - // write size of chunk in uint32_t - { - auto size = static_cast(chunk.size()); - if constexpr (debug_mode) { - std::printf("Size of chunk: %u\n", size); - } - fwrite(&size, sizeof(uint32_t), 1, t_out); - } - uint8_t remainder = 0x00; - for(size_t i = 0; i < chunk.size(); ++i) { - if(i % 2 == 0) { - // char = xxxx xxxx xxxx - // ^^^^^^^^^ ^^^^ - // write keep in remainder as xxxx0000 - auto temp = static_cast(chunk[i] >> 4); - fwrite(&temp, sizeof(temp), 1, t_out); - if constexpr (debug_mode) { - std::printf("writing: %x\t\t", temp); - } - remainder = static_cast(chunk[i] << 4); - } else { - // already have `remainder = yyyy0000` - // char = xxxx xxxx xxxx - // ^^^^ ^^^^^^^^^ - // remainder = yyyyxxxx write after remainder - // remainder = 00000000 - remainder &= static_cast(chunk[i]) >> 8 & 0xF0; - fwrite(&remainder, sizeof(remainder), 1, t_out); - if constexpr (debug_mode) { - std::printf("writing remainder: %x\t\t", remainder); - } - auto temp = static_cast(chunk[i]); - fwrite(&temp, sizeof(temp), 1, t_out); - if constexpr (debug_mode) { - std::printf("writing: %x\n", temp); - } - remainder = 0x00; - } - } - if(remainder != 0) { - fwrite(&remainder, sizeof(remainder), 1, t_out); + fwrite(&char_size, sizeof(char_size), 1, t_out); + fwrite(&size, sizeof(size), 1, t_out); + for (const auto &chunk : t_text) { + if constexpr (debug_mode) { + std::printf("Size of chunk: %zu\n", chunk.size()); } + write_chunk(t_out, chunk); + } +} + +/** + * \param t_out Output file + * \param t_chunk Chunk to be written to \p t_out + */ +void write_chunk(FILE *const t_out, const vuint32 &t_chunk) { + const auto chunk_size = static_cast(t_chunk.size()); + fwrite(&chunk_size, sizeof(chunk_size), 1, t_out); + std::array data{}; + for (size_t i = 0; i < t_chunk.size(); ++i) { + data.fill(0); + if (i % 2 == 0) { + data[0] = static_cast(t_chunk[i] >> 4); + data[1] = static_cast(t_chunk[i] << 4); + } else { + data[1] |= static_cast(t_chunk[i] >> 8) & 0x0F; + data[2] = static_cast(t_chunk[i]); + fwrite(data.data(), sizeof(data[0]), 3, t_out); + } + } + if (t_chunk.size() % 2 != 0) { + fwrite(data.data(), sizeof(data[0]), 3, t_out); } } diff --git a/src/io.hh b/src/io.hh index 4f9f358..78b15fa 100644 --- a/src/io.hh +++ b/src/io.hh @@ -8,6 +8,7 @@ #include #include +#include #include /* @@ -26,6 +27,9 @@ /// \brief Écrit dans le fichier le texte compressé -void write_file(FILE *, std::vector> &); +void write_file(FILE *const, const std::vector> &); + +/// \brief Écrit un chunk dans le fichier de sortie +void write_chunk(FILE *const, const std::vector &); #endif /* LZW_SRC_IO_H_ */ diff --git a/src/utf8.cc b/src/utf8.cc deleted file mode 100644 index 0086007..0000000 --- a/src/utf8.cc +++ /dev/null @@ -1,52 +0,0 @@ -/** - * \file utf8.cc - * \brief Implementation for UTF-8 related functions - */ - -#include "utf8.hh" -#include - -using FILE = std::FILE; -using uint8_t = std::uint8_t; -using uint32_t = std::uint32_t; -using ustring = std::basic_string; // chaine non encodée - -/** - * Les caractères \c passés en argument sont écrit dans le fichier de sortie au - * format UTF-8 - * - * \param[in] out Fichier de sortie - * \param[in] c Caractères à écrire dans \p out - */ -void write_utf8(FILE* t_out, uint32_t t_c) { - if(t_c < 128) { - fwrite(&t_c, sizeof(unsigned char), 1, t_out); - return; - } - size_t loops = 0; - unsigned char header = 0; - if (t_c < 2048) { - loops = 1; - header = 0xC0; - } else if (t_c < 65536) { - loops = 2; - header = 0xE0; - } else if (t_c < 2097152) { - loops = 3; - header = 0xF0; - } else if (t_c < 67108864) { - loops = 4; - header = 0xF8; - } else { - loops = 5; - header = 0xFC; - } - - ustring str(loops + 1, 0); - for (size_t i = 0; i <= loops; ++i) { - str[i] = static_cast( - ((t_c & ((i == loops) ? 0x3F : 0xFF)) >> ((loops - i) * 6)) + - ((i == 0) ? header : 0x80)); - } - fwrite(str.data(), sizeof(unsigned char), str.size(), t_out); -} diff --git a/src/utf8.hh b/src/utf8.hh deleted file mode 100644 index 4b51be2..0000000 --- a/src/utf8.hh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * \file utf8.hh - * \brief Header for UTF-8 related functions - */ - -#ifndef LZW_SRC_UTF8_H_ -#define LZW_SRC_UTF8_H_ - -#include -#include - -/* - L’encodage des caractères se fait en UTF-8 - char < 128 => "0xxxxxxx" 7bits - char < 2,048 => "110xxxxx 10xxxxxx" 11bits - char < 65,536 => "1110xxxx 10xxxxxx 10xxxxxx" 16bits - char < 2,097,152 => "11110xxx 10xxxxxx 10xxxxxx 10xxxxxx" 21bits - char < 67,108,864 => "111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx" 26bits - char < 2,147,483,648 => "1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx" 31bits -*/ - - -/// \brief Écrit les caractères au format UTF-8 -void write_utf8(std::FILE* t_out, std::uint32_t t_c); - -#endif /* LZW_SRC_UTF8_H_ */