Changed I/O, compression behavior, removed threads
This commit is contained in:
parent
2cfb560153
commit
72c71c306f
120
src/compress.cc
120
src/compress.cc
@ -4,39 +4,39 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "compress.hh"
|
#include "compress.hh"
|
||||||
#include "utf8.hh"
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include "io.hh"
|
#include "io.hh"
|
||||||
|
using std::vector;
|
||||||
|
using std::uint8_t;
|
||||||
|
using std::uint32_t;
|
||||||
|
using std::string;
|
||||||
|
using vuint32 = vector<uint32_t>;
|
||||||
|
using vvuint32 = vector<vuint32>;
|
||||||
|
using std::string;
|
||||||
|
|
||||||
using dict_t = std::map<std::pair<uint32_t, uint8_t>, uint32_t>;
|
using dict_t = std::map<std::pair<uint32_t, uint8_t>, uint32_t>;
|
||||||
using ustring = std::basic_string<uint8_t>; // chaîne non encodée
|
using ustring = std::basic_string<uint8_t>; // chaîne non encodée
|
||||||
using uvec = std::vector<std::uint32_t>; // chaîne encodée
|
using uvec = std::vector<std::uint32_t>; // chaîne encodée
|
||||||
using std::printf;
|
using std::printf;
|
||||||
|
|
||||||
constexpr size_t CHUNK_SIZE = 32768;
|
// constexpr size_t CHUNK_SIZE = 32768;
|
||||||
|
|
||||||
/**
|
constexpr int ipow(int base, int exp) {
|
||||||
*
|
int result = 1;
|
||||||
* Reçoit une liste de paires std::thread/vecteurs, le premier étant le
|
for (;;) {
|
||||||
* processus dont sa sortie est stockée dans le second. La sortie, une liste
|
if (exp & 1) {
|
||||||
* de caractères uint32_t, est écrite dans le fichier de sortie \p out.
|
result *= base;
|
||||||
*
|
|
||||||
* \param[in] t_threads
|
|
||||||
* \param[out] t_out
|
|
||||||
*/
|
|
||||||
void join_and_write(
|
|
||||||
std::vector<std::pair<std::unique_ptr<std::thread>, uvec>> &t_threads,
|
|
||||||
std::vector<std::vector<std::uint32_t>> &compressed_text) {
|
|
||||||
for (auto &elem : t_threads) {
|
|
||||||
(*elem.first).join();
|
|
||||||
}
|
}
|
||||||
for (auto &elem : t_threads) {
|
exp >>= 1;
|
||||||
compressed_text.push_back(std::move(elem.second));
|
if (exp == 0) {
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
t_threads.clear();
|
base *= base;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -49,26 +49,36 @@ void join_and_write(
|
|||||||
* \param[in] t_text Chaîne de caractères uint8_t représentant le fichier d'entrée
|
* \param[in] t_text Chaîne de caractères uint8_t représentant le fichier d'entrée
|
||||||
* \param[out] t_res Chaîne de caractères de sortie
|
* \param[out] t_res Chaîne de caractères de sortie
|
||||||
*/
|
*/
|
||||||
void lzw_compress(const std::vector<char> &t_text, uvec &t_res) {
|
vvuint32 lzw_compress(string &&t_text) {
|
||||||
dict_t dictionary{};
|
|
||||||
std::puts("Compressing...");
|
std::puts("Compressing...");
|
||||||
uint32_t w = 0xFFFF;
|
uint32_t w = 0xFFFF;
|
||||||
|
vuint32 chunk{};
|
||||||
|
vvuint32 res{};
|
||||||
|
dict_t dict{};
|
||||||
|
|
||||||
constexpr size_t DICT_MAX = 7936; /* 12 bits */
|
constexpr size_t DICT_MAX = ipow(2, 13) - 256; /* 12 bits */
|
||||||
|
|
||||||
for (const auto &c : t_text) {
|
for(const auto c : t_text) {
|
||||||
if (dictionary.size() >= DICT_MAX) {
|
if(dict.size() >= DICT_MAX) {
|
||||||
t_res.push_back(static_cast<uint32_t>(w));
|
// Dictionary full -> chunk pushed, dict emptied
|
||||||
w = static_cast<uint32_t>(c);
|
res.push_back(std::move(chunk));
|
||||||
} else if (const auto &[exists, pos] =
|
chunk = vuint32{};
|
||||||
dico(dictionary, w, static_cast<std::uint8_t>(c));
|
dict = dict_t{};
|
||||||
exists) {
|
w = 0xFFFF;
|
||||||
|
}
|
||||||
|
if (const auto &[yes, pos] = dico(dict, w, static_cast<uint8_t>(c)); yes) {
|
||||||
w = pos;
|
w = pos;
|
||||||
} else {
|
} else {
|
||||||
t_res.push_back(static_cast<uint32_t>(w));
|
chunk.push_back(static_cast<uint32_t>(w));
|
||||||
w = static_cast<std::uint8_t>(c);
|
w = static_cast<uint32_t>(c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if(w != 0xFFFF) {
|
||||||
|
chunk.push_back(w);
|
||||||
|
res.push_back(std::move(chunk));
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -91,8 +101,10 @@ void compress(const std::string &t_in_file, const char *t_out_file) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Fichier de sortie
|
// Fichier de sortie
|
||||||
FILE *out =
|
FILE *const out =
|
||||||
(t_out_file != nullptr) ? fopen(t_out_file, "wb") : fopen("output.lzw", "wb");
|
(t_out_file != nullptr) ? fopen(t_out_file, "wb") : fopen("output.lzw", "wb");
|
||||||
|
// std::ofstream out{(t_out_file != nullptr) ? t_out_file : "output.lzw",
|
||||||
|
// std::ios::binary};
|
||||||
if (out == nullptr) {
|
if (out == nullptr) {
|
||||||
std::cerr << "Error at " << __FILE__ << ":" << __LINE__ - 4
|
std::cerr << "Error at " << __FILE__ << ":" << __LINE__ - 4
|
||||||
<< ": could not open output file. Aborting...\n";
|
<< ": could not open output file. Aborting...\n";
|
||||||
@ -100,47 +112,9 @@ void compress(const std::string &t_in_file, const char *t_out_file) {
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// collection of chunks
|
const auto compressed_text{
|
||||||
std::vector<std::vector<std::uint32_t>> compressed_text{};
|
lzw_compress(std::string{std::istreambuf_iterator<char>(input_file),
|
||||||
|
std::istreambuf_iterator<char>()})};
|
||||||
// thread pool
|
|
||||||
std::vector<std::pair<std::unique_ptr<std::thread>, uvec>> threads{};
|
|
||||||
|
|
||||||
// chunk chars
|
|
||||||
std::vector<char> chunk(CHUNK_SIZE, 0);
|
|
||||||
while (input_file.read(chunk.data(),
|
|
||||||
static_cast<std::streamsize>(chunk.size()))) {
|
|
||||||
threads.emplace_back(nullptr, uvec{});
|
|
||||||
threads.back().second.reserve(CHUNK_SIZE);
|
|
||||||
threads.back().first = std::make_unique<std::thread>(
|
|
||||||
std::thread{lzw_compress, chunk, ref(threads.back().second)});
|
|
||||||
assert(threads.back().first);
|
|
||||||
if (threads.size() >= 8) {
|
|
||||||
join_and_write(threads, compressed_text);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!threads.empty()) {
|
|
||||||
join_and_write(threads, compressed_text);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (input_file.tellg() != std::ios::end) {
|
|
||||||
std::puts("Leftovers, compressing...");
|
|
||||||
{
|
|
||||||
const auto prev_pos = input_file.tellg();
|
|
||||||
input_file.seekg(0, std::ios::end);
|
|
||||||
chunk.reserve(static_cast<size_t>(input_file.tellg() - prev_pos));
|
|
||||||
input_file.seekg(prev_pos, std::ios::beg);
|
|
||||||
std::istreambuf_iterator<char> itr(input_file);
|
|
||||||
for (std::streamoff i = 0; i < prev_pos; ++i, ++itr){
|
|
||||||
;
|
|
||||||
}
|
|
||||||
chunk.assign((itr), std::istreambuf_iterator<char>());
|
|
||||||
}
|
|
||||||
uvec ret{};
|
|
||||||
lzw_compress(chunk, ret);
|
|
||||||
compressed_text.push_back(std::move(ret));
|
|
||||||
}
|
|
||||||
|
|
||||||
write_file(out, compressed_text);
|
write_file(out, compressed_text);
|
||||||
|
|
||||||
|
@ -10,14 +10,14 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
|
#include <optional>
|
||||||
/// \brief Exécution des threads et écriture de leur résultat dans le fichier de sortie
|
|
||||||
void join_and_write(std::vector<std::pair<std::unique_ptr<std::thread>,
|
|
||||||
std::vector<std::uint32_t>>> &,
|
|
||||||
std::vector<std::vector<std::uint32_t>> &);
|
|
||||||
|
|
||||||
/// \brief Compression d'une chaine de caractères
|
/// \brief Compression d'une chaine de caractères
|
||||||
void lzw_compress(const std::vector<char> &, std::vector<std::uint32_t> &);
|
std::vector<std::vector<std::uint32_t>> lzw_compress(std::string &&);
|
||||||
|
|
||||||
|
std::optional<std::uint32_t>
|
||||||
|
lzw_compress_char(std::vector<std::vector<std::uint32_t>> &,
|
||||||
|
std::vector<std::uint32_t> &, const char);
|
||||||
|
|
||||||
/// \brief Wrapper de \ref lzw_compress
|
/// \brief Wrapper de \ref lzw_compress
|
||||||
void compress(const std::string &, const char *);
|
void compress(const std::string &, const char *);
|
||||||
|
80
src/io.cc
80
src/io.cc
@ -4,13 +4,24 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "io.hh"
|
#include "io.hh"
|
||||||
|
#include <array>
|
||||||
|
|
||||||
#ifdef Debug
|
#ifdef Debug
|
||||||
constexpr bool debug_mode = true;
|
constexpr bool debug_mode = true;
|
||||||
|
#include <algorithm>
|
||||||
#else
|
#else
|
||||||
constexpr bool debug_mode = false;
|
constexpr bool debug_mode = false;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
using std::vector;
|
||||||
|
using std::uint32_t;
|
||||||
|
using vuint32 = vector<uint32_t>;
|
||||||
|
using vvuint32 = vector<vuint32>;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
constexpr unsigned char char_size = 12;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Écrit dans le fichier \p t_out les chunks passés en paramètre. Le fichier de
|
* Écrit dans le fichier \p t_out les chunks passés en paramètre. Le fichier de
|
||||||
* sortie est composé des éléments suivants :\n
|
* sortie est composé des éléments suivants :\n
|
||||||
@ -35,61 +46,42 @@ constexpr bool debug_mode = false;
|
|||||||
* \param[out] t_out Fichier de sortie
|
* \param[out] t_out Fichier de sortie
|
||||||
* \param[in] t_text Collection ordonnée des chunks à écrire dans \p t_out
|
* \param[in] t_text Collection ordonnée des chunks à écrire dans \p t_out
|
||||||
*/
|
*/
|
||||||
void write_file(FILE *t_out, std::vector<std::vector<std::uint32_t>> &t_text) {
|
void write_file(FILE *const t_out, const vvuint32 &t_text) {
|
||||||
{
|
const auto size = static_cast<uint32_t>(t_text.size());
|
||||||
uint32_t char_size = 12;
|
|
||||||
if constexpr (debug_mode) {
|
if constexpr (debug_mode) {
|
||||||
std::printf("Char size: %u\n", char_size);
|
std::printf("Char size: %u\n", char_size);
|
||||||
}
|
|
||||||
fwrite(&char_size, sizeof(uint32_t), 1, t_out);
|
|
||||||
auto size = static_cast<uint32_t>(t_text.size());
|
|
||||||
if constexpr (debug_mode) {
|
|
||||||
std::printf("Number of chunks: %u\n", size);
|
std::printf("Number of chunks: %u\n", size);
|
||||||
}
|
}
|
||||||
fwrite(&size, sizeof(uint32_t), 1, t_out);
|
fwrite(&char_size, sizeof(char_size), 1, t_out);
|
||||||
}
|
fwrite(&size, sizeof(size), 1, t_out);
|
||||||
for (const auto &chunk : t_text) {
|
for (const auto &chunk : t_text) {
|
||||||
// write size of chunk in uint32_t
|
|
||||||
{
|
|
||||||
auto size = static_cast<uint32_t>(chunk.size());
|
|
||||||
if constexpr (debug_mode) {
|
if constexpr (debug_mode) {
|
||||||
std::printf("Size of chunk: %u\n", size);
|
std::printf("Size of chunk: %zu\n", chunk.size());
|
||||||
}
|
}
|
||||||
fwrite(&size, sizeof(uint32_t), 1, t_out);
|
write_chunk(t_out, chunk);
|
||||||
}
|
}
|
||||||
uint8_t remainder = 0x00;
|
}
|
||||||
for(size_t i = 0; i < chunk.size(); ++i) {
|
|
||||||
|
/**
|
||||||
|
* \param t_out Output file
|
||||||
|
* \param t_chunk Chunk to be written to \p t_out
|
||||||
|
*/
|
||||||
|
void write_chunk(FILE *const t_out, const vuint32 &t_chunk) {
|
||||||
|
const auto chunk_size = static_cast<uint32_t>(t_chunk.size());
|
||||||
|
fwrite(&chunk_size, sizeof(chunk_size), 1, t_out);
|
||||||
|
std::array<unsigned char, 3> data{};
|
||||||
|
for (size_t i = 0; i < t_chunk.size(); ++i) {
|
||||||
|
data.fill(0);
|
||||||
if (i % 2 == 0) {
|
if (i % 2 == 0) {
|
||||||
// char = xxxx xxxx xxxx
|
data[0] = static_cast<unsigned char>(t_chunk[i] >> 4);
|
||||||
// ^^^^^^^^^ ^^^^
|
data[1] = static_cast<unsigned char>(t_chunk[i] << 4);
|
||||||
// write keep in remainder as xxxx0000
|
|
||||||
auto temp = static_cast<unsigned char>(chunk[i] >> 4);
|
|
||||||
fwrite(&temp, sizeof(temp), 1, t_out);
|
|
||||||
if constexpr (debug_mode) {
|
|
||||||
std::printf("writing: %x\t\t", temp);
|
|
||||||
}
|
|
||||||
remainder = static_cast<uint8_t>(chunk[i] << 4);
|
|
||||||
} else {
|
} else {
|
||||||
// already have `remainder = yyyy0000`
|
data[1] |= static_cast<unsigned char>(t_chunk[i] >> 8) & 0x0F;
|
||||||
// char = xxxx xxxx xxxx
|
data[2] = static_cast<unsigned char>(t_chunk[i]);
|
||||||
// ^^^^ ^^^^^^^^^
|
fwrite(data.data(), sizeof(data[0]), 3, t_out);
|
||||||
// remainder = yyyyxxxx write after remainder
|
|
||||||
// remainder = 00000000
|
|
||||||
remainder &= static_cast<unsigned char>(chunk[i]) >> 8 & 0xF0;
|
|
||||||
fwrite(&remainder, sizeof(remainder), 1, t_out);
|
|
||||||
if constexpr (debug_mode) {
|
|
||||||
std::printf("writing remainder: %x\t\t", remainder);
|
|
||||||
}
|
|
||||||
auto temp = static_cast<unsigned char>(chunk[i]);
|
|
||||||
fwrite(&temp, sizeof(temp), 1, t_out);
|
|
||||||
if constexpr (debug_mode) {
|
|
||||||
std::printf("writing: %x\n", temp);
|
|
||||||
}
|
|
||||||
remainder = 0x00;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(remainder != 0) {
|
if (t_chunk.size() % 2 != 0) {
|
||||||
fwrite(&remainder, sizeof(remainder), 1, t_out);
|
fwrite(data.data(), sizeof(data[0]), 3, t_out);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
#include <iostream>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -26,6 +27,9 @@
|
|||||||
|
|
||||||
|
|
||||||
/// \brief Écrit dans le fichier le texte compressé
|
/// \brief Écrit dans le fichier le texte compressé
|
||||||
void write_file(FILE *, std::vector<std::vector<std::uint32_t>> &);
|
void write_file(FILE *const, const std::vector<std::vector<std::uint32_t>> &);
|
||||||
|
|
||||||
|
/// \brief Écrit un chunk dans le fichier de sortie
|
||||||
|
void write_chunk(FILE *const, const std::vector<std::uint32_t> &);
|
||||||
|
|
||||||
#endif /* LZW_SRC_IO_H_ */
|
#endif /* LZW_SRC_IO_H_ */
|
||||||
|
52
src/utf8.cc
52
src/utf8.cc
@ -1,52 +0,0 @@
|
|||||||
/**
|
|
||||||
* \file utf8.cc
|
|
||||||
* \brief Implementation for UTF-8 related functions
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "utf8.hh"
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
using FILE = std::FILE;
|
|
||||||
using uint8_t = std::uint8_t;
|
|
||||||
using uint32_t = std::uint32_t;
|
|
||||||
using ustring = std::basic_string<uint8_t>; // chaine non encodée
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Les caractères \c passés en argument sont écrit dans le fichier de sortie au
|
|
||||||
* format UTF-8
|
|
||||||
*
|
|
||||||
* \param[in] out Fichier de sortie
|
|
||||||
* \param[in] c Caractères à écrire dans \p out
|
|
||||||
*/
|
|
||||||
void write_utf8(FILE* t_out, uint32_t t_c) {
|
|
||||||
if(t_c < 128) {
|
|
||||||
fwrite(&t_c, sizeof(unsigned char), 1, t_out);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
size_t loops = 0;
|
|
||||||
unsigned char header = 0;
|
|
||||||
if (t_c < 2048) {
|
|
||||||
loops = 1;
|
|
||||||
header = 0xC0;
|
|
||||||
} else if (t_c < 65536) {
|
|
||||||
loops = 2;
|
|
||||||
header = 0xE0;
|
|
||||||
} else if (t_c < 2097152) {
|
|
||||||
loops = 3;
|
|
||||||
header = 0xF0;
|
|
||||||
} else if (t_c < 67108864) {
|
|
||||||
loops = 4;
|
|
||||||
header = 0xF8;
|
|
||||||
} else {
|
|
||||||
loops = 5;
|
|
||||||
header = 0xFC;
|
|
||||||
}
|
|
||||||
|
|
||||||
ustring str(loops + 1, 0);
|
|
||||||
for (size_t i = 0; i <= loops; ++i) {
|
|
||||||
str[i] = static_cast<unsigned char>(
|
|
||||||
((t_c & ((i == loops) ? 0x3F : 0xFF)) >> ((loops - i) * 6)) +
|
|
||||||
((i == 0) ? header : 0x80));
|
|
||||||
}
|
|
||||||
fwrite(str.data(), sizeof(unsigned char), str.size(), t_out);
|
|
||||||
}
|
|
26
src/utf8.hh
26
src/utf8.hh
@ -1,26 +0,0 @@
|
|||||||
/**
|
|
||||||
* \file utf8.hh
|
|
||||||
* \brief Header for UTF-8 related functions
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef LZW_SRC_UTF8_H_
|
|
||||||
#define LZW_SRC_UTF8_H_
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdint>
|
|
||||||
|
|
||||||
/*
|
|
||||||
L’encodage des caractères se fait en UTF-8
|
|
||||||
char < 128 => "0xxxxxxx" 7bits
|
|
||||||
char < 2,048 => "110xxxxx 10xxxxxx" 11bits
|
|
||||||
char < 65,536 => "1110xxxx 10xxxxxx 10xxxxxx" 16bits
|
|
||||||
char < 2,097,152 => "11110xxx 10xxxxxx 10xxxxxx 10xxxxxx" 21bits
|
|
||||||
char < 67,108,864 => "111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx" 26bits
|
|
||||||
char < 2,147,483,648 => "1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx" 31bits
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
/// \brief Écrit les caractères au format UTF-8
|
|
||||||
void write_utf8(std::FILE* t_out, std::uint32_t t_c);
|
|
||||||
|
|
||||||
#endif /* LZW_SRC_UTF8_H_ */
|
|
Loading…
Reference in New Issue
Block a user