diff --git a/argsparser.cpp b/argsparser.cpp index d4d43b7..76caf0a 100644 --- a/argsparser.cpp +++ b/argsparser.cpp @@ -4,7 +4,7 @@ const string ArgsParser::SPACES(24, ' '); string ArgsParser::formatHelp(const string& help) const { if (help.find('\n') == std::string::npos) return help; - std::vector elems = strutils::split(help, '\n'); + std::vector elems = stringutils::split(help, '\n'); fmt::MemoryWriter w; w.write("{}", elems[0]); for (size_t i = 1; i < elems.size(); i++) { diff --git a/fmt b/fmt index 796beaa..d8754af 160000 --- a/fmt +++ b/fmt @@ -1 +1 @@ -Subproject commit 796beaaddb5226162fe00c2c55e322d80d26f3d8 +Subproject commit d8754af0639bed3ebdbba6e1fff5057b8ba6e093 diff --git a/gzipio.cpp b/gzipio.cpp new file mode 100644 index 0000000..50aee6c --- /dev/null +++ b/gzipio.cpp @@ -0,0 +1,138 @@ +#include "gzipio.h" + +const std::unordered_set GZipOut::gzip_ext_set{ + {".gz", ".zip", ".7z", ".bzip2", ".bz2"}}; +const size_t GZipOut::MxBfL = 4 * 1024; + +void GZipOut::FlushBf() { + size_t BytesOut = fwrite(Bf, 1, BfL, ZipStdinWr); + assert(BytesOut == BfL); + BfL = 0; +} + +void GZipOut::CreateZipProcess(const std::string& cmd, + const std::string& zip_fnm) { + std::string cmd_line = + fmt::format("{} {}", cmd.c_str(), zip_fnm.c_str()); + cmd_line += " >/dev/null"; + ZipStdinWr = popen(cmd_line.c_str(), "w"); + assert_msg(ZipStdinWr != NULL, "Can not execute '%s'", + cmd_line.c_str()); +} + +GZipOut::GZipOut(const std::string& filename) + : ZipStdinRd(NULL), ZipStdinWr(NULL), Bf(NULL), BfL(0) { + CreateZipProcess(getCmd(filename), filename); + Bf = new char[MxBfL]; + BfL = 0; +} + +GZipOut::~GZipOut() { + close(); + if (Bf != NULL) delete[] Bf; +} + +void GZipOut::close() { + if (BfL != 0) FlushBf(); + if (ZipStdinWr != NULL) { + assert_msg(pclose(ZipStdinWr) != -1, + "Closing of the process failed"); + ZipStdinWr = NULL; + } +} + +int GZipOut::putChar(const char& Ch) { + if (BfL == MxBfL) FlushBf(); + return Bf[BfL++] = Ch; +} + +void 
GZipOut::write(const void* data, const size_t length) { + if (BfL + length > MxBfL) { + for (size_t LBfC = 0; LBfC < length; LBfC++) + putChar(((char*)data)[LBfC]); + } else { + for (size_t LBfC = 0; LBfC < length; LBfC++) + Bf[BfL++] = ((char*)data)[LBfC]; + } +} + +void GZipOut::Flush() { FlushBf(); } + +bool GZipOut::isZip(const std::string& filename) { + std::string base, name, ext; + stringutils::splitFilename(filename, base, name, ext); + return gzip_ext_set.find(ext) != gzip_ext_set.end(); +} + +std::string GZipOut::getCmd(const std::string& zip_fnm) { + std::string base, name, ext; + stringutils::splitFilename(zip_fnm, base, name, ext); + assert_msg(gzip_ext_set.find(ext) != gzip_ext_set.end(), + "Unknown file extension '%s'.", ext.c_str()); + return "7za a -y -bd -si" + name; +} + +///////////////////////////////////////////////////////// +const std::unordered_set GZipIn::gzip_ext_set{ + {".gz", ".zip", ".7z", ".bzip2", ".bz2"}}; +const int GZipIn::MxBfL = 32 * 1024; + +void GZipIn::CreateZipProcess(const std::string& cmd, + const std::string& zip_fnm) { + std::string cmd_line = + fmt::format("{} {}", cmd.c_str(), zip_fnm.c_str()); + cmd_line += " 2>/dev/null"; + ZipStdoutRd = popen(cmd_line.c_str(), "r"); + assert_msg(ZipStdoutRd != NULL, "Can not execute '%s'", + cmd_line.c_str()); +} + +void GZipIn::FillBf() { + size_t BytesRead = fread(Bf, 1, MxBfL, ZipStdoutRd); + BfL = (int)BytesRead; + CurFPos += BytesRead; + BfC = 0; +} + +GZipIn::GZipIn(const std::string& filename) + : ZipStdoutRd(NULL), ZipStdoutWr(NULL), CurFPos(0), + Bf(NULL), BfC(0), BfL(0) { + CreateZipProcess(getCmd(filename), filename); + Bf = new char[MxBfL]; + BfC = BfL = -1; + FillBf(); +} + +GZipIn::~GZipIn() { + if (ZipStdoutRd != NULL) + assert_msg(pclose(ZipStdoutRd) != -1, + "Closing of the process failed"); + if (Bf != NULL) delete[] Bf; +} + +size_t GZipIn::read(const void* LBf, const size_t LBfL) { + size_t LBfS = 0; + if (BfC + LBfL > BfL) { + for (size_t LBfC = 0; LBfC < LBfL; 
LBfC++) { + if (BfC == BfL) FillBf(); + LBfS += ((char*)LBf)[LBfC] = Bf[BfC++]; + } + } else { + for (size_t LBfC = 0; LBfC < LBfL; LBfC++) + LBfS += (((char*)LBf)[LBfC] = Bf[BfC++]); + } + return LBfS; +} + +bool GZipIn::isZip(const std::string& filename) { + std::string base, name, ext; + stringutils::splitFilename(filename, base, name, ext); + return gzip_ext_set.find(ext) != gzip_ext_set.end(); +} + +std::string GZipIn::getCmd(const std::string& zip_fnm) { + assert_msg(isZip(zip_fnm), + "Unsupported file extension '%s'", + zip_fnm.c_str()); + return "7za e -y -bd -so"; +} diff --git a/gzipio.h b/gzipio.h new file mode 100644 index 0000000..fa3f945 --- /dev/null +++ b/gzipio.h @@ -0,0 +1,87 @@ +#ifndef __GZIPIO_H__ +#define __GZIPIO_H__ +// Compressed input and output streams. +// 7za.exe or 7z.exe must be in the path +// (http://www.7-zip.org) +// 7za.exe is a stand-alone program, which supports +// -- extraction: .gz, .7z, .rar, .zip, .cab, .arj. bzip2 +// -- compression: .7z +// 7z.exe uses DLLs in folders Codecs and Formats +// -- extraction: .gz, .7z, .rar, .zip, .cab, .arj. bzip2 +// -- compression: .gz, .7z, .rar, .zip, .cab, .arj. 
bzip2 + +#include +#include +#include + +#include +#include + +#include +#define assert_msg PPK_ASSERT + +class GZipOut { +private: + static const size_t MxBfL; + static const std::unordered_set gzip_ext_set; + FILE *ZipStdinRd, *ZipStdinWr; + char* Bf; + size_t BfL; + +private: + void FlushBf(); + void CreateZipProcess(const std::string& cmd, + const std::string& filename); + +public: + GZipOut(const std::string& filename); + ~GZipOut(); + + int putChar(const char& ch); + void write(const void* data, const size_t length); + void save(const char* str) { write(str, strlen(str)); } + void save(const int val) { write(&val, sizeof(int)); } + void save(const long val) { write(&val, sizeof(long)); } + void save(const double val) { write(&val, sizeof(double)); } + + void Flush(); + void close(); + + static bool isZip(const std::string& filename); + std::string getCmd(const std::string& filename); +}; + +class GZipIn { +private: + static const int MxBfL; + static const std::unordered_set gzip_ext_set; + FILE *ZipStdoutRd, *ZipStdoutWr; + size_t CurFPos; + char* Bf; + int BfC, BfL; + +private: + void FillBf(); + void CreateZipProcess(const std::string& cmd, + const std::string& zip_fnm); + +public: + GZipIn(const std::string& filename); + ~GZipIn(); + + bool eof() { return BfL < MxBfL && BfC == BfL; } + char getChar() { + if (BfC == BfL) FillBf(); + return Bf[BfC++]; + } + + size_t read(const void* LBf, const size_t LBfL); + void load(int& val) { read(&val, sizeof(int)); } + void load(long& val) { read(&val, sizeof(long)); } + void load(double& val) { read(&val, sizeof(double)); } + + std::string getCmd(const std::string& zip_fnm); + static bool isZip(const std::string& filename); +}; + +#endif /* __GZIPIO_H__ */ diff --git a/ioutils.h b/ioutils.h new file mode 100644 index 0000000..6afe411 --- /dev/null +++ b/ioutils.h @@ -0,0 +1,16 @@ +#ifndef __IOUTILS_H__ +#define __IOUTILS_H__ + +#include + +template +saveMap(std::unordered_map map_to_save, + std::string zip_filename) 
{ + GZipOut gzo(zip_filename); + for (auto& pr : map_to_save) { + gzo.save(fmt::format("{}\t{}\n", source, id, pr.second) + .c_str()); + } +} + +#endif /* __IOUTILS_H__ */ diff --git a/lz4 b/lz4 new file mode 160000 index 0000000..7bb64ff --- /dev/null +++ b/lz4 @@ -0,0 +1 @@ +Subproject commit 7bb64ff2b69a9f8367de9ab483cdadf42b4c1b65 diff --git a/lz4io/lz4io.cpp b/lz4io/lz4io.cpp new file mode 100644 index 0000000..aff90b2 --- /dev/null +++ b/lz4io/lz4io.cpp @@ -0,0 +1,193 @@ +#include "lz4io.h" + +namespace lz4 { + +size_t write_int(FILE* fp, const int i) { + return fwrite(&i, sizeof(int), 1, fp); +} + +size_t write_bin(FILE* fp, const void* array, + const size_t arrayBytes) { + return fwrite(array, 1, arrayBytes, fp); +} + +size_t read_int(FILE* fp, int* i) { + return fread(i, sizeof(int), 1, fp); +} + +size_t read_bin(FILE* fp, void* array, + const size_t arrayBytes) { + return fread(array, 1, arrayBytes, fp); +} + +//============================================================ + +LZ4Out::LZ4Out() { + data_buf_ = new char[DATA_CAPACITY]; + chunk_buf_ = new char[CHUNK_CAPACITY]; + if (data_buf_ == NULL || chunk_buf_ == NULL) { + fprintf(stderr, "Allocate space failed!\n"); + exit(1); + } + len_dat_ = 0; + output_ = NULL; +} + +LZ4Out::~LZ4Out() { + close(); + if (data_buf_ != NULL) delete[] data_buf_; + if (chunk_buf_ != NULL) delete[] chunk_buf_; +} + +void LZ4Out::close() { + if (output_ != NULL) { + writeChunk(); + fclose(output_); + output_ = NULL; + } +} + +void LZ4Out::open(const char* file_name, const bool append) { + close(); + const char* mode = append ? 
"ab" : "wb"; + output_ = fopen(file_name, mode); + if (output_ == NULL) { + fprintf(stderr, "Open file '%s' failed!\n", file_name); + exit(1); + } +} + +void LZ4Out::write(const void* data, const size_t length) { + size_t written = 0; + while (length - written > 0) { + size_t len_available = DATA_CAPACITY - len_dat_; + if (len_available == 0) writeChunk(); + size_t num_to_write = + std::min(length - written, len_available); + memcpy(data_buf_ + len_dat_, (char*)data + written, + num_to_write); + len_dat_ += num_to_write; + written += num_to_write; + } +} + +void LZ4Out::writeChunk() { + if (len_dat_ == 0) return; + size_t chunk_len = LZ4_compress_fast( + data_buf_, chunk_buf_, len_dat_, CHUNK_CAPACITY, 9); + if (chunk_len > 0) { + write_int(output_, chunk_len); + write_bin(output_, chunk_buf_, chunk_len); + } + len_dat_ = 0; +} + +void LZ4Out::compress(const char* file_name) { + FILE* file_id = fopen(file_name, "rb"); + if (file_id == NULL) { + fprintf(stderr, "Open file '%s' failed!\n", file_name); + exit(1); + } + char buffer[1024]; + size_t num_read; + while (true) { + num_read = fread(buffer, 1, 1024, file_id); + write(buffer, num_read); + if (num_read < 1024) break; + } + fclose(file_id); + close(); +} + +//================================================= + +LZ4In::LZ4In() { + chunk_buf_ = new char[CHUNK_CAPACITY]; + data_buf_ = new char[DATA_CAPACITY]; + if (data_buf_ == NULL || chunk_buf_ == NULL) { + fprintf(stderr, "Allocate space failed!\n"); + exit(1); + } + input_ = NULL; + len_data_ = num_read_ = 0; +} + +LZ4In::~LZ4In() { + close(); + if (chunk_buf_ != NULL) delete[] chunk_buf_; + if (data_buf_ != NULL) delete[] data_buf_; +} + +void LZ4In::close() { + if (input_ != NULL) { + fclose(input_); + input_ = NULL; + } + len_data_ = num_read_ = 0; +} + +void LZ4In::open(const char* file_name) { + close(); + input_ = fopen(file_name, "rb"); + if (input_ == NULL) { + fprintf(stderr, "Open file '%s' failed!\n", file_name); + exit(1); + } +} + +bool 
LZ4In::readOriChunk(char* data, size_t& length) { + length = 0; + int chunk_len; + size_t num_read = read_int(input_, &chunk_len); + if (num_read < 1 || chunk_len == 0) return false; + length = read_bin(input_, data, chunk_len); + return length > 0; +} + +bool LZ4In::readChunk(char* data, size_t& length) { + length = 0; + int chunk_len; + size_t num_read = read_int(input_, &chunk_len); + if (num_read < 1 || chunk_len == 0) return false; + read_bin(input_, chunk_buf_, chunk_len); + length = LZ4_decompress_safe(chunk_buf_, data, chunk_len, + (int)DATA_CAPACITY); + return length > 0; +} + +bool LZ4In::eof() { + if (num_read_ < len_data_) return false; + return !fillBuffer(); +} + +bool LZ4In::fillBuffer() { + num_read_ = 0; + return readChunk(data_buf_, len_data_); +} + +size_t LZ4In::read(const void* data, const size_t length) { + size_t have_read = 0; + while (have_read < length) { + size_t len_available = len_data_ - num_read_; + if (len_available == 0) fillBuffer(); + size_t num_to_read = + std::min(length - have_read, len_available); + memcpy((char*)data + have_read, data_buf_ + num_read_, + num_to_read); + num_read_ += num_to_read; + have_read += num_to_read; + } + return have_read; +} + +void LZ4In::decompress(const char* output_file_name) { + FILE* file_id = fopen(output_file_name, "wb"); + char buffer[1024]; + size_t num_read; + while (!eof()) { + num_read = read(buffer, 1024); + write_bin(file_id, buffer, num_read); + } + fclose(file_id); +} +} diff --git a/lz4io/lz4io.h b/lz4io/lz4io.h new file mode 100644 index 0000000..14b9946 --- /dev/null +++ b/lz4io/lz4io.h @@ -0,0 +1,131 @@ +#ifndef __LZ4IO_H__ +#define __LZ4IO_H__ + +#include +#include +#include +#include +#include + +#include + +namespace lz4 { + +enum { + BLOCK_BYTES = 65536, // 64KB +}; + +static const size_t DATA_CAPACITY = BLOCK_BYTES; +static const size_t CHUNK_CAPACITY = + LZ4_COMPRESSBOUND(BLOCK_BYTES); + +class LZ4Out { +private: + char *data_buf_, *chunk_buf_; + size_t len_dat_; + FILE* 
output_; + +private: + void writeChunk(); + +public: + LZ4Out(); + LZ4Out(const char* file_name, const bool append = false) + : LZ4Out() { + open(file_name, append); + } + + // do not allow copy constructor or assignment + LZ4Out(const LZ4Out&) = delete; + LZ4Out& operator=(const LZ4Out&) = delete; + + // move constructor + LZ4Out(LZ4Out&& other) + : data_buf_(std::move(other.data_buf_)), + chunk_buf_(std::move(other.chunk_buf_)), + len_dat_(other.len_dat_), + output_(std::move(other.output_)) { + other.data_buf_ = other.chunk_buf_ = NULL; + other.len_dat_ = 0; + other.output_ = NULL; + } + // move assignment + LZ4Out& operator=(LZ4Out&& other) { + data_buf_ = std::move(other.data_buf_); + chunk_buf_ = std::move(other.chunk_buf_); + len_dat_ = other.len_dat_; + output_ = std::move(other.output_); + other.data_buf_ = other.chunk_buf_ = NULL; + other.len_dat_ = 0; + other.output_ = NULL; + return *this; + } + + ~LZ4Out(); + + void open(const char* file_name, const bool append = false); + // add data to buffer, the data may be larger than buffer + // size. 
/// Reader for the chunked files written by LZ4Out. Owns two heap buffers
/// and the input FILE*; instances can be moved but not copied.
class LZ4In {
private:
  // chunk_buf_ receives a compressed chunk, data_buf_ its decoded bytes.
  char *chunk_buf_, *data_buf_;
  // len_data_: bytes currently held in data_buf_;
  // num_read_: how many of them have already been handed out.
  size_t len_data_, num_read_;
  FILE* input_;

private:
  bool fillBuffer();
  bool readOriChunk(char* data, size_t& length);
  bool readChunk(char* data, size_t& length);

public:
  LZ4In();
  // Opens file_name for reading. open() prints a message and exits the
  // process when the file cannot be opened.
  LZ4In(const char* file_name) : LZ4In() { open(file_name); }
  ~LZ4In();

  // do not allow copy constructor or assignment
  LZ4In(const LZ4In&) = delete;
  LZ4In& operator=(const LZ4In&) = delete;

  // move constructor
  LZ4In(LZ4In&& other)
      : chunk_buf_(other.chunk_buf_),
        data_buf_(other.data_buf_),
        len_data_(other.len_data_),
        num_read_(other.num_read_),
        input_(other.input_) {
    other.chunk_buf_ = other.data_buf_ = NULL;
    other.len_data_ = other.num_read_ = 0;
    other.input_ = NULL;
  }

  // move assignment: the counterpart of the move constructor. Releases the
  // file and buffers this object holds before taking over other's.
  LZ4In& operator=(LZ4In&& other) {
    if (this != &other) {
      close();
      delete[] chunk_buf_;
      delete[] data_buf_;

      chunk_buf_ = other.chunk_buf_;
      data_buf_ = other.data_buf_;
      len_data_ = other.len_data_;
      num_read_ = other.num_read_;
      input_ = other.input_;

      other.chunk_buf_ = other.data_buf_ = NULL;
      other.len_data_ = other.num_read_ = 0;
      other.input_ = NULL;
    }
    return *this;
  }

  void open(const char* file_name);
  void close();
  bool eof();

  // Copies up to len decoded bytes into data (which is written to despite
  // the const qualifier) and returns the number of bytes copied.
  size_t read(const void* data, const size_t len);
  // The load() helpers read the raw in-memory representation of the value
  // and ignore the byte count returned by read().
  void load(int& val) { read(&val, sizeof(int)); }
  void load(long& val) { read(&val, sizeof(long)); }
  void load(double& val) { read(&val, sizeof(double)); }

  void decompress(const char* output_file_name);
};
defined(__unix__) || defined(__unix) || \ (defined(__APPLE__) && defined(__MACH__)) #include #define RANDUTILS_GETPID getpid() @@ -250,11 +250,13 @@ struct seed_seq_fe { seed_seq_fe(const seed_seq_fe &) = delete; void operator=(const seed_seq_fe &) = delete; - template seed_seq_fe(std::initializer_list init) { + template + seed_seq_fe(std::initializer_list init) { seed(init.begin(), init.end()); } - template seed_seq_fe(InputIter begin, InputIter end) { + template + seed_seq_fe(InputIter begin, InputIter end) { seed(begin, end); } @@ -264,14 +266,15 @@ struct seed_seq_fe { static constexpr size_t size() { return count; } - template void param(OutputIterator dest) const; + template + void param(OutputIterator dest) const; - template void seed(InputIter begin, InputIter end) { + template + void seed(InputIter begin, InputIter end) { mix_entropy(begin, end); // For very small sizes, we do some additional mixing. For normal // sizes, this loop never performs any iterations. - for (size_t i = 1; i < mix_rounds; ++i) - stir(); + for (size_t i = 1; i < mix_rounds; ++i) stir(); } seed_seq_fe &stir() { @@ -307,11 +310,9 @@ void seed_seq_fe::mix_entropy(InputIter begin, } for (auto &src : mixer_) for (auto &dest : mixer_) - if (&src != &dest) - dest = mix(dest, hash(src)); + if (&src != &dest) dest = mix(dest, hash(src)); for (; current != end; ++current) - for (auto &dest : mixer_) - dest = mix(dest, hash(*current)); + for (auto &dest : mixer_) dest = mix(dest, hash(*current)); } template @@ -362,8 +363,7 @@ void seed_seq_fe::generate( auto hash_const = INIT_B; for (auto dest = dest_begin; dest != dest_end; ++dest) { auto dataval = *src; - if (++src == src_end) - src = src_begin; + if (++src == src_end) src = src_begin; dataval ^= hash_const; hash_const *= MULT_B; dataval *= hash_const; @@ -400,10 +400,12 @@ using seed_seq_fe256 = seed_seq_fe<8, uint32_t>; * http://www.pcg-random.org/posts/cpps-random_device.html */ -template class auto_seeded : public SeedSeq { +template 
+class auto_seeded : public SeedSeq { using default_seeds = std::array; - template static uint32_t crushto32(T value) { + template + static uint32_t crushto32(T value) { if (sizeof(T) <= 4) return uint32_t(value); else { @@ -413,7 +415,8 @@ template class auto_seeded : public SeedSeq { } } - template static uint32_t hash(T &&value) { + template + static uint32_t hash(T &&value) { return crushto32( std::hash::type>::type>{}(std::forward(value))); @@ -579,7 +582,8 @@ class random_generator { return true; } - template static constexpr bool has_base_seed_seq(...) { + template + static constexpr bool has_base_seed_seq(...) { return false; } @@ -637,17 +641,16 @@ class random_generator { typename... Params> ResultType variate(Params &&... params) { DistTmpl dist(std::forward(params)...); - return dist(engine_); } - template Numeric uniform(Numeric lower, Numeric upper) { + template + Numeric uniform(Numeric lower, Numeric upper) { return variate(lower, upper); } - double uniform() { - return variate(0.0, 1.0); - } + // [0, 1) + double uniform() { return variate(0.0, 1.0); } double normal(double mean = 0, double variance = 1) { return variate(mean, variance); @@ -671,18 +674,20 @@ class random_generator { std::forward(params)...); } - template void shuffle(Iter first, Iter last) { + template + void shuffle(Iter first, Iter last) { std::shuffle(first, last, engine_); } - template void shuffle(Range &&range) { + template + void shuffle(Range &&range) { shuffle(std::begin(range), std::end(range)); } - template Iter choose(Iter first, Iter last) { + template + Iter choose(Iter first, Iter last) { auto dist = std::distance(first, last); - if (dist < 2) - return first; + if (dist < 2) return first; using distance_type = decltype(dist); distance_type choice = uniform(distance_type(0), --dist); std::advance(first, choice); @@ -730,6 +735,22 @@ class random_generator { using default_rng = random_generator; using mt19937_rng = random_generator; + +class RandUtils { +private: + 
default_rng rng; + +public: + RandUtils() { rng.seed(time(NULL)); } + + // distribution: a distribution, require that sum(distribution)=1 + const int sampleDistribution(const std::vector distribution) { + int id = 0; + double r = rng.uniform(), accu = distribution[0]; + while (r > accu) accu += distribution[++id]; + return id; + } +}; } -#endif // RANDUTILS_HPP +#endif // RANDUTILS_HPP diff --git a/stringutils.cpp b/stringutils.cpp index 802e1db..b1cb0fa 100644 --- a/stringutils.cpp +++ b/stringutils.cpp @@ -1,22 +1,20 @@ #include "stringutils.h" -namespace strutils { -void splitFilename(const std::string &fullname, - std::string &base, - std::string &filename_wo_ext, - std::string &ext) { - // base.clear(); - // filename_wo_ext.clear(); - // ext.clear(); - if (fullname.empty()) return; - auto dir_idx = fullname.rfind('/'); +namespace stringutils { + +void splitFilename(const std::string &filename, + std::string &base, + std::string &filename_wo_ext, + std::string &ext) { + if (filename.empty()) return; + auto dir_idx = filename.rfind('/'); std::string filename_w_ext; if (dir_idx != std::string::npos) { - base = fullname.substr(0, dir_idx + 1); - filename_w_ext = fullname.substr(dir_idx + 1); + base = filename.substr(0, dir_idx + 1); + filename_w_ext = filename.substr(dir_idx + 1); } else { base = "./"; - filename_w_ext = fullname; + filename_w_ext = filename; } auto ext_idx = filename_w_ext.rfind('.'); filename_wo_ext = filename_w_ext.substr(0, ext_idx); @@ -29,7 +27,7 @@ void splitFilename(const std::string &fullname, * ../syn/events2_U100_I10_T100_test.dat */ std::string insertMiddle(const std::string &filename, - const std::string &sufix) { + const std::string &sufix) { auto idx = filename.rfind('.'); if (idx != std::string::npos) { std::string extension = filename.substr(idx + 1); @@ -39,15 +37,29 @@ std::string insertMiddle(const std::string &filename, return filename + "_" + sufix; } -std::string getBasePath(const std::string &fullname) { - auto idx = 
/// Cuts s at every occurrence of delim and appends the pieces to elems.
/// Empty pieces between or before delimiters are kept; a delimiter at the
/// very end of s does not produce a trailing empty piece, and an empty s
/// appends nothing.
void split(const std::string &s, const char delim,
           std::vector<std::string> &elems) {
  std::string::size_type start = 0;
  while (start < s.size()) {
    const std::string::size_type stop = s.find(delim, start);
    if (stop == std::string::npos) {
      // No further delimiter: the remainder is the last piece.
      elems.push_back(s.substr(start));
      return;
    }
    elems.push_back(s.substr(start, stop - start));
    start = stop + 1;
  }
}

/// Same as above, but returns the pieces in a new vector.
std::vector<std::string> split(const std::string &s,
                               const char delim) {
  std::vector<std::string> pieces;
  split(s, delim, pieces);
  return pieces;
}
/// Returns the character that separates path components on the platform
/// being compiled for: a backslash when _WIN32 is defined, a forward slash
/// otherwise.
inline char pathSeparator() {
#if defined(_WIN32)
  const char separator = '\\';
#else
  const char separator = '/';
#endif
  return separator;
}
SkipLeadBlanks(_SkipLeadBlanks), + SkipCmt(_SkipCmt), SkipEmptyFld(_SkipEmptyFld), + LineCnt(0), SplitCh('\t'), FldV(), FInPt(NULL), + Silent(Silent) { + if (TZipIn::IsZipExt(FNm.GetFExt())) + FInPt = TZipIn::New(FNm, Silent); + else + FInPt = TFIn::New(FNm); + + // Bf = new char [BfLen]; + switch (SsFmt) { + case ssfTabSep: + SplitCh = '\t'; + break; + case ssfCommaSep: + SplitCh = ','; + break; + case ssfSemicolonSep: + SplitCh = ';'; + break; + case ssfSpaceSep: + SplitCh = ' '; + break; + case ssfWhiteSep: + SplitCh = ' '; + break; + default: + FailR("Unknown separator character."); + break; + } +} + +TSVParser::TSVParser(const TStr& FNm, const char& Separator, + const bool& Silent, + const bool& _SkipLeadBlanks, + const bool& _SkipCmt, + const bool& _SkipEmptyFld) + : SsFmt(ssfSpaceSep), SkipLeadBlanks(_SkipLeadBlanks), + SkipCmt(_SkipCmt), SkipEmptyFld(_SkipEmptyFld), + LineCnt(0), /*Bf(NULL),*/ SplitCh('\t'), FldV(), + FInPt(NULL), Silent(Silent) { + if (TZipIn::IsZipExt(FNm.GetFExt())) + FInPt = TZipIn::New(FNm, Silent); + else + FInPt = TFIn::New(FNm); + SplitCh = Separator; +} + +TSVParser::~TSVParser() { + // if (Bf != NULL) { delete [] Bf; } +} + +/*bool TSVParser::Next() { // split on SplitCh + const char* EndBf = Bf+BfLen-1; + memset(Bf, 0, BfLen); + char *cur = Bf, *last = Bf; + FldV.Clr(false); + TSIn& FIn = *FInPt; + if (SkipLeadBlanks) { // skip leadning blanks + while (! FIn.Eof() && cur < EndBf && (FIn.PeekCh()=='\t' || + FIn.PeekCh()==' ')) { FIn.GetCh(); } + } + while (! FIn.Eof() && cur < EndBf) { + if (SsFmt == ssfWhiteSep) { + while (! FIn.Eof() && cur < EndBf && ! + TCh::IsWs(*cur=FIn.GetCh())) { cur++; } + } else { + while (! 
FIn.Eof() && cur < EndBf && + (*cur=FIn.GetCh())!=SplitCh && *cur!='\r' && *cur!='\n') { + cur++; } + } + if (*cur=='\r' || *cur=='\n') { + *cur = 0; cur++; + if (*last) { FldV.Add(last); } + last = cur; + break; + } + *cur = 0; cur++; + FldV.Add(last); last = cur; + if (SkipEmpty && strlen(FldV.Last())==0) { FldV.DelLast(); } + } + if (SkipEmpty && FldV.Len()>0 && strlen(FldV.Last())==0) { + FldV.DelLast(); + } + LineCnt++; + if (! FldV.Empty() && cur < EndBf) { + if (SkipCmt && IsCmt()) { return Next(); } + else { return true; } } + else if (! FIn.Eof() && ! SkipEmpty) { return true; } + else { return false; } + }*/ + +bool TSVParser::Next() { // split on SplitCh + FldV.Clr(false); + LineStr.Clr(); + FldV.Clr(); + LineCnt++; + if (!FInPt->GetNextLn(LineStr)) return false; + if (SkipCmt && LineStr.Len() > 0 && LineStr[0] == '#') + return Next(); + + char* cur = LineStr.CStr(); + if (SkipLeadBlanks) { // skip leadning blanks + while (*cur && TCh::IsWs(*cur)) cur++; + } + char* last = cur; + while (*cur) { + if (SsFmt == ssfWhiteSep) { + while (*cur && !TCh::IsWs(*cur)) cur++; + } else { + while (*cur && *cur != SplitCh) cur++; + } + if (*cur == 0) break; + + *cur = 0; + cur++; + FldV.Add(last); + last = cur; + if (SkipEmptyFld && strlen(FldV.Last()) == 0) + FldV.DelLast(); // skip empty fields + } + FldV.Add(last); // add last field + if (SkipEmptyFld && FldV.Empty()) + return Next(); // skip empty lines + return true; + + // const char* EndBf = Bf+BfLen-1; + // memset(Bf, 0, BfLen); + // char *cur = Bf, *last = Bf; + /*if (SkipLeadBlanks) { // skip leadning blanks + while (! FIn.Eof() && cur < EndBf && (FIn.PeekCh()=='\t' + || FIn.PeekCh()==' ')) { FIn.GetCh(); } + } + while (! FIn.Eof() && cur < EndBf) { + if (SsFmt == ssfWhiteSep) { + while (! FIn.Eof() && cur < EndBf && ! + TCh::IsWs(*cur=FIn.GetCh())) { cur++; } + } else { + while (! 
FIn.Eof() && cur < EndBf && + (*cur=FIn.GetCh())!=SplitCh && *cur!='\r' && *cur!='\n') { + cur++; } + } + if (*cur=='\r' || *cur=='\n') { + if (! FIn.Eof()) { // read the remaining of the line + if (*cur == '\r' && FIn.PeekCh()=='\n') { FIn.GetCh(); } + else if (*cur == '\n' && FIn.PeekCh()=='\r') { + FIn.GetCh(); } + } + *cur = 0; cur++; + FldV.Add(last); + last = cur; + break; + } + *cur = 0; cur++; + FldV.Add(last); last = cur; + if (SkipEmptyFld && strlen(FldV.Last())==0) { + FldV.DelLast(); } // skip empty fields + } + LineCnt++; + if (SkipCmt && IsCmt() && ! FIn.Eof()) { return Next(); } + if (FldV.Len() == 1 && strlen(FldV[0])==0) { FldV.Clr(); + return true; } + if (SkipEmptyFld && FldV.Len()>0 && + strlen(FldV.Last())==0) { FldV.DelLast(); } + return ! FIn.Eof() || ! FldV.Empty(); + //if (SkipEmptyFld && FldV.Empty() && ! FIn.Eof()) { + return Next(); } // skip empty line + */ +} + +void TSVParser::ToLc() { + for (int f = 0; f < FldV.Len(); f++) { + for (char* c = FldV[f]; *c; c++) *c = tolower(*c); + } +} + +bool TSVParser::GetInt(const int& FldN, int& Val) const { + // parsing format {ws} [+/-] +{ddd} + int _Val = -1; + bool Minus = false; + const char* c = GetFld(FldN); + while (TCh::IsWs(*c)) c++; + if (*c == '-') { + Minus = true; + c++; + } + if (!TCh::IsNum(*c)) return false; + _Val = TCh::GetNum(*c); + c++; + while (TCh::IsNum(*c)) { + _Val = 10 * _Val + TCh::GetNum(*c); + c++; + } + if (Minus) _Val = -_Val; + if (*c != 0) return false; + Val = _Val; + return true; +} + +bool TSVParser::GetFlt(const int& FldN, double& Val) const { + // parsing format {ws} [+/-] +{d} ([.]{d}) ([E|e] [+/-] + // +{d}) + const char* c = GetFld(FldN); + while (TCh::IsWs(*c)) c++; + if (*c == '+' || *c == '-') c++; + if (!TCh::IsNum(*c) && *c != '.') return false; + while (TCh::IsNum(*c)) c++; + if (*c == '.') { + c++; + while (TCh::IsNum(*c)) c++; + } + if (*c == 'e' || *c == 'E') { + c++; + if (*c == '+' || *c == '-') c++; + if (!TCh::IsNum(*c)) return false; + while 
(TCh::IsNum(*c)) c++; + } + if (*c != 0) return false; + Val = atof(GetFld(FldN)); + return true; +} + +const char* TSVParser::DumpStr() const { + static TChA ChA(10 * 1024); + ChA.Clr(); + for (int i = 0; i < FldV.Len(); i++) + ChA += TStr::Fmt(" %d: '%s'\n", i, FldV[i]); + return ChA.CStr(); +} diff --git a/tsv_parser.h b/tsv_parser.h new file mode 100644 index 0000000..b6dfc87 --- /dev/null +++ b/tsv_parser.h @@ -0,0 +1,73 @@ +#ifndef IOUTILS_H +#define IOUTILS_H + +class TSVParser { +private: + TSsFmt SsFmt; + bool SkipLeadBlanks, SkipCmt, SkipEmptyFld; + uint64 LineCnt; + char SplitCh; + TChA LineStr; + TVec FldV; + PSIn FInPt; + bool Silent; + +public: + TSVParser(const TStr& FNm, const bool& Silent = true, + const TSsFmt _SsFmt = ssfTabSep, + const bool& _SkipLeadBlanks = true, + const bool& _SkipCmt = true, + const bool& _SkipEmptyFld = true); + TSVParser(const TStr& FNm, const TSsFmt _SsFmt) + : TSVParser(FNm, true, _SsFmt) {} + TSVParser(const TStr& FNm, const char& Separator, + const bool& Silent = true, + const bool& _SkipLeadBlanks = false, + const bool& _SkipCmt = true, + const bool& _SkipEmptyFld = false); + ~TSVParser(); + bool Next(); + int Len() const { return FldV.Len(); } + int GetFlds() const { return Len(); } + uint64 GetLineNo() const { return LineCnt; } + bool IsCmt() const { + return Len() > 0 && GetFld(0)[0] == '#'; + } + bool Eof() const { return FInPt->Eof(); } + const TChA& GetLnStr() const { return LineStr; } + void ToLc(); + const char* GetFld(const int& FldN) const { + return FldV[FldN]; + } + char* GetFld(const int& FldN) { return FldV[FldN]; } + const char* operator[](const int& FldN) const { + return FldV[FldN]; + } + char* operator[](const int& FldN) { return FldV[FldN]; } + bool GetInt(const int& FldN, int& Val) const; + int GetInt(const int& FldN) const { + int Val = 0; + IAssertR( + GetInt(FldN, Val), + TStr::Fmt("Field %d not INT.\n%s", FldN, DumpStr()) + .CStr()); + return Val; + } + bool IsInt(const int& FldN) const { + int 
// Returns field FldN of the current line parsed as a double, asserting
// that the field is a well-formed floating point number.
double GetFlt(const int& FldN) const {
  double Val = 0.0;
  // NOTE(review): the parse itself runs inside the assertion. If IAssert
  // can be compiled out in some build configuration, the call disappears
  // with it and 0.0 is returned for every field - confirm how IAssert is
  // defined.
  IAssert(GetFlt(FldN, Val));
  return Val;
}