From 65e9022f51de07701d1de0032a8fdd25a2389af5 Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Thu, 24 Oct 2024 12:05:26 +0200 Subject: [PATCH] remove usage of non-standard types in std::char_traits --- extras/rapidfuzz_amalgamated.hpp | 133 ++++++++++++------ fuzzing/fuzz_damerau_levenshtein_distance.cpp | 10 +- fuzzing/fuzz_indel_distance.cpp | 5 +- fuzzing/fuzz_indel_editops.cpp | 4 +- fuzzing/fuzz_jaro_similarity.cpp | 12 +- fuzzing/fuzz_lcs_similarity.cpp | 6 +- fuzzing/fuzz_levenshtein_distance.cpp | 10 +- fuzzing/fuzz_levenshtein_editops.cpp | 12 +- fuzzing/fuzz_osa_distance.cpp | 10 +- fuzzing/fuzzing.hpp | 18 +-- rapidfuzz/distance.hpp | 99 +++++++++---- rapidfuzz/fuzz.hpp | 8 +- test/common.hpp | 2 +- test/distance/examples/ocr.cpp | 4 +- test/distance/examples/ocr.hpp | 6 +- .../examples/pythonLevenshteinIssue9.cpp | 4 +- .../examples/pythonLevenshteinIssue9.hpp | 6 +- test/distance/tests-Hamming.cpp | 4 +- test/distance/tests-Indel.cpp | 2 +- test/distance/tests-Levenshtein.cpp | 36 ++--- 20 files changed, 246 insertions(+), 145 deletions(-) diff --git a/extras/rapidfuzz_amalgamated.hpp b/extras/rapidfuzz_amalgamated.hpp index e8978d0d..ee77bb3e 100644 --- a/extras/rapidfuzz_amalgamated.hpp +++ b/extras/rapidfuzz_amalgamated.hpp @@ -1,7 +1,7 @@ // Licensed under the MIT License . // SPDX-License-Identifier: MIT // RapidFuzz v1.0.2 -// Generated: 2024-07-02 16:47:26.932914 +// Generated: 2024-10-24 12:06:59.588890 // ---------------------------------------------------------- // This file is an amalgamation of multiple different files. // You probably shouldn't edit it directly. @@ -4511,8 +4511,8 @@ void lcs_simd(Range scores, const BlockPatternMatchVector& block, const #endif template -auto lcs_unroll(const PMV& block, const Range&, const Range& s2, size_t score_cutoff = 0) - -> LCSseqResult +auto lcs_unroll(const PMV& block, const Range&, const Range& s2, + size_t score_cutoff = 0) -> LCSseqResult { uint64_t S[N]; unroll([&](size_t i) { S[i] = ~UINT64_C(0); }); @@ -6662,12 +6662,12 @@ struct CachedJaroWinkler : public detail::CachedSimilarityBase -explicit CachedJaroWinkler(const Sentence1& s1_, double _prefix_weight = 0.1) - -> CachedJaroWinkler>; +explicit CachedJaroWinkler(const Sentence1& s1_, + double _prefix_weight = 0.1) -> CachedJaroWinkler>; template -CachedJaroWinkler(InputIt1 first1, InputIt1 last1, double _prefix_weight = 0.1) - -> CachedJaroWinkler>; +CachedJaroWinkler(InputIt1 first1, InputIt1 last1, + double _prefix_weight = 0.1) -> CachedJaroWinkler>; } // namespace rapidfuzz @@ -7135,8 +7135,8 @@ size_t levenshtein_hyrroe2003_small_band(const BlockPatternMatchVector& PM, cons } template -auto levenshtein_hyrroe2003_small_band(const Range& s1, const Range& s2, size_t max) - -> LevenshteinResult +auto levenshtein_hyrroe2003_small_band(const Range& s1, const Range& s2, + size_t max) -> LevenshteinResult { assert(max <= s1.size()); assert(max <= s2.size()); @@ -8358,12 +8358,12 @@ struct CachedLevenshtein : public detail::CachedDistanceBase -explicit CachedLevenshtein(const Sentence1& s1_, LevenshteinWeightTable aWeights = {1, 1, 1}) - -> CachedLevenshtein>; +explicit CachedLevenshtein(const Sentence1& s1_, LevenshteinWeightTable aWeights = { + 1, 1, 1}) -> CachedLevenshtein>; template -CachedLevenshtein(InputIt1 first1, InputIt1 last1, LevenshteinWeightTable aWeights = {1, 1, 1}) - -> CachedLevenshtein>; +CachedLevenshtein(InputIt1 first1, InputIt1 last1, + LevenshteinWeightTable aWeights = {1, 1, 1}) -> CachedLevenshtein>; } // namespace rapidfuzz @@ -9151,14 +9151,15 @@ CachedPrefix(InputIt1 first1, InputIt1 last1) -> CachedPrefix -std::basic_string editops_apply(const Editops& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, - InputIt2 last2) +namespace detail { +template +ReturnType editops_apply_impl(const Editops& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2) { auto len1 = static_cast(std::distance(first1, last1)); auto len2 = static_cast(std::distance(first2, last2)); - std::basic_string res_str; + ReturnType res_str; res_str.resize(len1 + len2); size_t src_pos = 0; size_t dest_pos = 0; @@ -9166,7 +9167,8 @@ std::basic_string editops_apply(const Editops& ops, InputIt1 first1, Inpu for (const auto& op : ops) { /* matches between last and current editop */ while (src_pos < op.src_pos) { - res_str[dest_pos] = static_cast(first1[static_cast(src_pos)]); + res_str[dest_pos] = + static_cast(first1[static_cast(src_pos)]); src_pos++; dest_pos++; } @@ -9174,12 +9176,14 @@ std::basic_string editops_apply(const Editops& ops, InputIt1 first1, Inpu switch (op.type) { case EditType::None: case EditType::Replace: - res_str[dest_pos] = static_cast(first2[static_cast(op.dest_pos)]); + res_str[dest_pos] = + static_cast(first2[static_cast(op.dest_pos)]); src_pos++; dest_pos++; break; case EditType::Insert: - res_str[dest_pos] = static_cast(first2[static_cast(op.dest_pos)]); + res_str[dest_pos] = + static_cast(first2[static_cast(op.dest_pos)]); dest_pos++; break; case EditType::Delete: src_pos++; break; @@ -9188,7 +9192,8 @@ std::basic_string editops_apply(const Editops& ops, InputIt1 first1, Inpu /* matches after the last editop */ while (src_pos < len1) { - res_str[dest_pos] = static_cast(first1[static_cast(src_pos)]); + res_str[dest_pos] = + static_cast(first1[static_cast(src_pos)]); src_pos++; dest_pos++; } @@ -9197,21 +9202,14 @@ std::basic_string editops_apply(const Editops& ops, InputIt1 first1, Inpu return res_str; } -template -std::basic_string editops_apply(const Editops& ops, const Sentence1& s1, const Sentence2& s2) -{ - return editops_apply(ops, detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), - detail::to_end(s2)); -} - -template -std::basic_string opcodes_apply(const Opcodes& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, - InputIt2 last2) +template +ReturnType opcodes_apply_impl(const Opcodes& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2) { auto len1 = static_cast(std::distance(first1, last1)); auto len2 = static_cast(std::distance(first2, last2)); - std::basic_string res_str; + ReturnType res_str; res_str.resize(len1 + len2); size_t dest_pos = 0; @@ -9219,13 +9217,15 @@ std::basic_string opcodes_apply(const Opcodes& ops, InputIt1 first1, Inpu switch (op.type) { case EditType::None: for (auto i = op.src_begin; i < op.src_end; ++i) { - res_str[dest_pos++] = static_cast(first1[static_cast(i)]); + res_str[dest_pos++] = + static_cast(first1[static_cast(i)]); } break; case EditType::Replace: case EditType::Insert: for (auto i = op.dest_begin; i < op.dest_end; ++i) { - res_str[dest_pos++] = static_cast(first2[static_cast(i)]); + res_str[dest_pos++] = + static_cast(first2[static_cast(i)]); } break; case EditType::Delete: break; @@ -9236,11 +9236,62 @@ std::basic_string opcodes_apply(const Opcodes& ops, InputIt1 first1, Inpu return res_str; } +} // namespace detail + +template +std::basic_string editops_apply_str(const Editops& ops, InputIt1 first1, InputIt1 last1, + InputIt2 first2, InputIt2 last2) +{ + return detail::editops_apply_impl>(ops, first1, last1, first2, last2); +} + +template +std::basic_string editops_apply_str(const Editops& ops, const Sentence1& s1, const Sentence2& s2) +{ + return detail::editops_apply_impl>(ops, detail::to_begin(s1), detail::to_end(s1), + detail::to_begin(s2), detail::to_end(s2)); +} + +template +std::basic_string opcodes_apply_str(const Opcodes& ops, InputIt1 first1, InputIt1 last1, + InputIt2 first2, InputIt2 last2) +{ + return detail::opcodes_apply_impl>(ops, first1, last1, first2, last2); +} + +template +std::basic_string opcodes_apply_str(const Opcodes& ops, const Sentence1& s1, const Sentence2& s2) +{ + return detail::opcodes_apply_impl>(ops, detail::to_begin(s1), detail::to_end(s1), + detail::to_begin(s2), detail::to_end(s2)); +} + +template +std::vector editops_apply_vec(const Editops& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2) +{ + return detail::editops_apply_impl>(ops, first1, last1, first2, last2); +} + +template +std::vector editops_apply_vec(const Editops& ops, const Sentence1& s1, const Sentence2& s2) +{ + return detail::editops_apply_impl>(ops, detail::to_begin(s1), detail::to_end(s1), + detail::to_begin(s2), detail::to_end(s2)); +} + +template +std::vector opcodes_apply_vec(const Opcodes& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2) +{ + return detail::opcodes_apply_impl>(ops, first1, last1, first2, last2); +} + template -std::basic_string opcodes_apply(const Opcodes& ops, const Sentence1& s1, const Sentence2& s2) +std::vector opcodes_apply_vec(const Opcodes& ops, const Sentence1& s1, const Sentence2& s2) { - return opcodes_apply(ops, detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), - detail::to_end(s2)); + return detail::opcodes_apply_impl>(ops, detail::to_begin(s1), detail::to_end(s1), + detail::to_begin(s2), detail::to_end(s2)); } } // namespace rapidfuzz @@ -9669,8 +9720,8 @@ explicit CachedPartialTokenSortRatio(const Sentence1& s1) -> CachedPartialTokenSortRatio>; template -CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1) - -> CachedPartialTokenSortRatio>; +CachedPartialTokenSortRatio(InputIt1 first1, + InputIt1 last1) -> CachedPartialTokenSortRatio>; /** * @brief Compares the words in the strings based on unique and common words @@ -9793,8 +9844,8 @@ template explicit CachedPartialTokenSetRatio(const Sentence1& s1) -> CachedPartialTokenSetRatio>; template -CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1) - -> CachedPartialTokenSetRatio>; +CachedPartialTokenSetRatio(InputIt1 first1, + InputIt1 last1) -> CachedPartialTokenSetRatio>; /** * @brief Helper method that returns the maximum of fuzz::token_set_ratio and diff --git a/fuzzing/fuzz_damerau_levenshtein_distance.cpp b/fuzzing/fuzz_damerau_levenshtein_distance.cpp index b1066168..743cf74a 100644 --- a/fuzzing/fuzz_damerau_levenshtein_distance.cpp +++ b/fuzzing/fuzz_damerau_levenshtein_distance.cpp @@ -8,8 +8,8 @@ #include #include -void validate_distance(size_t reference_dist, const std::basic_string& s1, - const std::basic_string& s2, size_t score_cutoff) +void validate_distance(size_t reference_dist, const std::vector& s1, const std::vector& s2, + size_t score_cutoff) { if (reference_dist > score_cutoff) reference_dist = score_cutoff + 1; @@ -26,7 +26,7 @@ void validate_distance(size_t reference_dist, const std::basic_string& extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - std::basic_string s1, s2; + std::vector s1, s2; if (!extract_strings(data, size, s1, s2)) return 0; size_t reference_dist = rapidfuzz_reference::damerau_levenshtein_distance(s1, s2); @@ -40,8 +40,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) /* test long sequences */ for (unsigned int i = 2; i < 9; ++i) { - std::basic_string s1_ = str_multiply(s1, pow(2, i)); - std::basic_string s2_ = str_multiply(s2, pow(2, i)); + std::vector s1_ = vec_multiply(s1, pow(2, i)); + std::vector s2_ = vec_multiply(s2, pow(2, i)); if (s1_.size() > 10000 || s2_.size() > 10000) break; diff --git a/fuzzing/fuzz_indel_distance.cpp b/fuzzing/fuzz_indel_distance.cpp index 88b8ade1..546bcd14 100644 --- a/fuzzing/fuzz_indel_distance.cpp +++ b/fuzzing/fuzz_indel_distance.cpp @@ -8,8 +8,7 @@ #include #include -void validate_distance(const std::basic_string& s1, const std::basic_string& s2, - size_t score_cutoff) +void validate_distance(const std::vector& s1, const std::vector& s2, size_t score_cutoff) { auto dist = rapidfuzz::indel_distance(s1, s2, score_cutoff); auto reference_dist = rapidfuzz_reference::indel_distance(s1, s2, score_cutoff); @@ -25,7 +24,7 @@ void validate_distance(const std::basic_string& s1, const std::basic_st extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - std::basic_string s1, s2; + std::vector s1, s2; if (!extract_strings(data, size, s1, s2)) return 0; validate_distance(s1, s2, 0); diff --git a/fuzzing/fuzz_indel_editops.cpp b/fuzzing/fuzz_indel_editops.cpp index 07cdf85e..4d99c300 100644 --- a/fuzzing/fuzz_indel_editops.cpp +++ b/fuzzing/fuzz_indel_editops.cpp @@ -9,13 +9,13 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - std::basic_string s1, s2; + std::vector s1, s2; if (!extract_strings(data, size, s1, s2)) return 0; size_t score = rapidfuzz_reference::indel_distance(s1, s2); rapidfuzz::Editops ops = rapidfuzz::indel_editops(s1, s2); - if (ops.size() == score && s2 != rapidfuzz::editops_apply(ops, s1, s2)) + if (ops.size() == score && s2 != rapidfuzz::editops_apply_vec(ops, s1, s2)) throw std::logic_error("levenshtein_editops failed"); return 0; diff --git a/fuzzing/fuzz_jaro_similarity.cpp b/fuzzing/fuzz_jaro_similarity.cpp index 1ac257b2..6ba5a046 100644 --- a/fuzzing/fuzz_jaro_similarity.cpp +++ b/fuzzing/fuzz_jaro_similarity.cpp @@ -14,7 +14,7 @@ bool is_close(double a, double b, double epsilon) } template -void validate_simd(const std::basic_string& s1, const std::basic_string& s2) +void validate_simd(const std::vector& s1, const std::vector& s2) { #ifdef RAPIDFUZZ_SIMD size_t count = s1.size() / MaxLen + ((s1.size() % MaxLen) != 0); @@ -22,7 +22,7 @@ void validate_simd(const std::basic_string& s1, const std::basic_string rapidfuzz::experimental::MultiJaro scorer(count); - std::vector> strings; + std::vector> strings; for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) { if (std::distance(it1, s1.end()) < static_cast(MaxLen)) { @@ -59,7 +59,7 @@ void validate_simd(const std::basic_string& s1, const std::basic_string #endif } -void validate_distance(const std::basic_string& s1, const std::basic_string& s2) +void validate_distance(const std::vector& s1, const std::vector& s2) { double reference_sim = rapidfuzz_reference::jaro_similarity(s1, s2); double sim = rapidfuzz::jaro_similarity(s1, s2); @@ -80,15 +80,15 @@ void validate_distance(const std::basic_string& s1, const std::basic_st extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - std::basic_string s1, s2; + std::vector s1, s2; if (!extract_strings(data, size, s1, s2)) return 0; validate_distance(s1, s2); /* test long sequences */ for (unsigned int i = 2; i < 9; ++i) { - std::basic_string s1_ = str_multiply(s1, pow(2, i)); - std::basic_string s2_ = str_multiply(s2, pow(2, i)); + std::vector s1_ = vec_multiply(s1, pow(2, i)); + std::vector s2_ = vec_multiply(s2, pow(2, i)); if (s1_.size() > 10000 || s2_.size() > 10000) break; diff --git a/fuzzing/fuzz_lcs_similarity.cpp b/fuzzing/fuzz_lcs_similarity.cpp index 5a82120f..7f833c51 100644 --- a/fuzzing/fuzz_lcs_similarity.cpp +++ b/fuzzing/fuzz_lcs_similarity.cpp @@ -9,13 +9,13 @@ #include template -void validate_simd(const std::basic_string& s1, const std::basic_string& s2) +void validate_simd(const std::vector& s1, const std::vector& s2) { #ifdef RAPIDFUZZ_SIMD size_t count = s1.size() / MaxLen + ((s1.size() % MaxLen) != 0); rapidfuzz::experimental::MultiLCSseq scorer(count); - std::vector> strings; + std::vector> strings; for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) { if (std::distance(it1, s1.end()) < static_cast(MaxLen)) { @@ -51,7 +51,7 @@ void validate_simd(const std::basic_string& s1, const std::basic_string extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - std::basic_string s1, s2; + std::vector s1, s2; if (!extract_strings(data, size, s1, s2)) { return 0; } diff --git a/fuzzing/fuzz_levenshtein_distance.cpp b/fuzzing/fuzz_levenshtein_distance.cpp index a595b8b9..a577c608 100644 --- a/fuzzing/fuzz_levenshtein_distance.cpp +++ b/fuzzing/fuzz_levenshtein_distance.cpp @@ -9,7 +9,7 @@ #include template -void validate_simd(const std::basic_string& s1, const std::basic_string& s2) +void validate_simd(const std::vector& s1, const std::vector& s2) { #ifdef RAPIDFUZZ_SIMD size_t count = s1.size() / MaxLen + ((s1.size() % MaxLen) != 0); @@ -17,7 +17,7 @@ void validate_simd(const std::basic_string& s1, const std::basic_string rapidfuzz::experimental::MultiLevenshtein scorer(count); - std::vector> strings; + std::vector> strings; for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) { if (std::distance(it1, s1.end()) < static_cast(MaxLen)) { @@ -52,8 +52,8 @@ void validate_simd(const std::basic_string& s1, const std::basic_string #endif } -void validate_distance(size_t reference_dist, const std::basic_string& s1, - const std::basic_string& s2, size_t score_cutoff) +void validate_distance(size_t reference_dist, const std::vector& s1, const std::vector& s2, + size_t score_cutoff) { if (reference_dist > score_cutoff) reference_dist = score_cutoff + 1; @@ -75,7 +75,7 @@ void validate_distance(size_t reference_dist, const std::basic_string& extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - std::basic_string s1, s2; + std::vector s1, s2; if (!extract_strings(data, size, s1, s2)) return 0; size_t reference_dist = rapidfuzz_reference::levenshtein_distance(s1, s2); diff --git a/fuzzing/fuzz_levenshtein_editops.cpp b/fuzzing/fuzz_levenshtein_editops.cpp index 596caaaa..fe09cb5e 100644 --- a/fuzzing/fuzz_levenshtein_editops.cpp +++ b/fuzzing/fuzz_levenshtein_editops.cpp @@ -7,17 +7,17 @@ #include #include -void validate_editops(const std::basic_string& s1, const std::basic_string& s2, - size_t score, size_t score_hint = std::numeric_limits::max()) +void validate_editops(const std::vector& s1, const std::vector& s2, size_t score, + size_t score_hint = std::numeric_limits::max()) { rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2, score_hint); - if (ops.size() == score && s2 != rapidfuzz::editops_apply(ops, s1, s2)) + if (ops.size() == score && s2 != rapidfuzz::editops_apply_vec(ops, s1, s2)) throw std::logic_error("levenshtein_editops failed"); } extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - std::basic_string s1, s2; + std::vector s1, s2; if (!extract_strings(data, size, s1, s2)) return 0; /* hirschbergs algorithm is only used for very long sequences which are apparently not generated a lot by @@ -36,8 +36,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) throw std::logic_error("find_hirschberg_pos failed"); } - s1 = str_multiply(s1, 2); - s2 = str_multiply(s2, 2); + s1 = vec_multiply(s1, 2); + s2 = vec_multiply(s2, 2); } return 0; diff --git a/fuzzing/fuzz_osa_distance.cpp b/fuzzing/fuzz_osa_distance.cpp index 0fa9f977..5cd75020 100644 --- a/fuzzing/fuzz_osa_distance.cpp +++ b/fuzzing/fuzz_osa_distance.cpp @@ -8,8 +8,8 @@ #include #include -void validate_distance(size_t reference_dist, const std::basic_string& s1, - const std::basic_string& s2, size_t score_cutoff) +void validate_distance(size_t reference_dist, const std::vector& s1, const std::vector& s2, + size_t score_cutoff) { if (reference_dist > score_cutoff) reference_dist = score_cutoff + 1; @@ -26,7 +26,7 @@ void validate_distance(size_t reference_dist, const std::basic_string& extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - std::basic_string s1, s2; + std::vector s1, s2; if (!extract_strings(data, size, s1, s2)) return 0; size_t reference_dist = rapidfuzz_reference::osa_distance(s1, s2); @@ -40,8 +40,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) /* test long sequences */ for (unsigned int i = 2; i < 9; ++i) { - std::basic_string s1_ = str_multiply(s1, pow(2, i)); - std::basic_string s2_ = str_multiply(s2, pow(2, i)); + std::vector s1_ = vec_multiply(s1, pow(2, i)); + std::vector s2_ = vec_multiply(s2, pow(2, i)); if (s1_.size() > 10000 || s2_.size() > 10000) break; diff --git a/fuzzing/fuzzing.hpp b/fuzzing/fuzzing.hpp index e77a58b5..282baf98 100644 --- a/fuzzing/fuzzing.hpp +++ b/fuzzing/fuzzing.hpp @@ -1,10 +1,10 @@ #pragma once #include #include -#include +#include -static inline bool extract_strings(const uint8_t* data, size_t size, std::basic_string& s1, - std::basic_string& s2) +static inline bool extract_strings(const uint8_t* data, size_t size, std::vector& s1, + std::vector& s2) { if (size <= sizeof(uint32_t)) { return false; @@ -17,8 +17,8 @@ static inline bool extract_strings(const uint8_t* data, size_t size, std::basic_ data += sizeof(len1); size -= sizeof(len1); - s1 = std::basic_string(data, len1); - s2 = std::basic_string(data + len1, size - len1); + s1 = std::vector(data, data + len1); + s2 = std::vector(data + len1, data + size); return true; } @@ -36,17 +36,17 @@ static inline T pow(T x, unsigned int p) } template -std::basic_string str_multiply(std::basic_string a, size_t b) +std::vector vec_multiply(const std::vector& a, size_t b) { - std::basic_string output; + std::vector output; while (b--) - output += a; + output.insert(output.end(), a.begin(), a.end()); return output; } template -void print_seq(const std::string& name, const std::basic_string& seq) +void print_seq(const std::string& name, const std::vector& seq) { std::cout << name << " len: " << seq.size() << " content: "; for (const auto& ch : seq) diff --git a/rapidfuzz/distance.hpp b/rapidfuzz/distance.hpp index 75bdb732..da686f7b 100644 --- a/rapidfuzz/distance.hpp +++ b/rapidfuzz/distance.hpp @@ -15,14 +15,15 @@ namespace rapidfuzz { -template -std::basic_string editops_apply(const Editops& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, - InputIt2 last2) +namespace detail { +template +ReturnType editops_apply_impl(const Editops& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2) { auto len1 = static_cast(std::distance(first1, last1)); auto len2 = static_cast(std::distance(first2, last2)); - std::basic_string res_str; + ReturnType res_str; res_str.resize(len1 + len2); size_t src_pos = 0; size_t dest_pos = 0; @@ -30,7 +31,8 @@ std::basic_string editops_apply(const Editops& ops, InputIt1 first1, Inpu for (const auto& op : ops) { /* matches between last and current editop */ while (src_pos < op.src_pos) { - res_str[dest_pos] = static_cast(first1[static_cast(src_pos)]); + res_str[dest_pos] = + static_cast(first1[static_cast(src_pos)]); src_pos++; dest_pos++; } @@ -38,12 +40,14 @@ std::basic_string editops_apply(const Editops& ops, InputIt1 first1, Inpu switch (op.type) { case EditType::None: case EditType::Replace: - res_str[dest_pos] = static_cast(first2[static_cast(op.dest_pos)]); + res_str[dest_pos] = + static_cast(first2[static_cast(op.dest_pos)]); src_pos++; dest_pos++; break; case EditType::Insert: - res_str[dest_pos] = static_cast(first2[static_cast(op.dest_pos)]); + res_str[dest_pos] = + static_cast(first2[static_cast(op.dest_pos)]); dest_pos++; break; case EditType::Delete: src_pos++; break; @@ -52,7 +56,8 @@ std::basic_string editops_apply(const Editops& ops, InputIt1 first1, Inpu /* matches after the last editop */ while (src_pos < len1) { - res_str[dest_pos] = static_cast(first1[static_cast(src_pos)]); + res_str[dest_pos] = + static_cast(first1[static_cast(src_pos)]); src_pos++; dest_pos++; } @@ -61,21 +66,14 @@ std::basic_string editops_apply(const Editops& ops, InputIt1 first1, Inpu return res_str; } -template -std::basic_string editops_apply(const Editops& ops, const Sentence1& s1, const Sentence2& s2) -{ - return editops_apply(ops, detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), - detail::to_end(s2)); -} - -template -std::basic_string opcodes_apply(const Opcodes& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, - InputIt2 last2) +template +ReturnType opcodes_apply_impl(const Opcodes& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2) { auto len1 = static_cast(std::distance(first1, last1)); auto len2 = static_cast(std::distance(first2, last2)); - std::basic_string res_str; + ReturnType res_str; res_str.resize(len1 + len2); size_t dest_pos = 0; @@ -83,13 +81,15 @@ std::basic_string opcodes_apply(const Opcodes& ops, InputIt1 first1, Inpu switch (op.type) { case EditType::None: for (auto i = op.src_begin; i < op.src_end; ++i) { - res_str[dest_pos++] = static_cast(first1[static_cast(i)]); + res_str[dest_pos++] = + static_cast(first1[static_cast(i)]); } break; case EditType::Replace: case EditType::Insert: for (auto i = op.dest_begin; i < op.dest_end; ++i) { - res_str[dest_pos++] = static_cast(first2[static_cast(i)]); + res_str[dest_pos++] = + static_cast(first2[static_cast(i)]); } break; case EditType::Delete: break; @@ -100,11 +100,62 @@ std::basic_string opcodes_apply(const Opcodes& ops, InputIt1 first1, Inpu return res_str; } +} // namespace detail + +template +std::basic_string editops_apply_str(const Editops& ops, InputIt1 first1, InputIt1 last1, + InputIt2 first2, InputIt2 last2) +{ + return detail::editops_apply_impl>(ops, first1, last1, first2, last2); +} + +template +std::basic_string editops_apply_str(const Editops& ops, const Sentence1& s1, const Sentence2& s2) +{ + return detail::editops_apply_impl>(ops, detail::to_begin(s1), detail::to_end(s1), + detail::to_begin(s2), detail::to_end(s2)); +} + +template +std::basic_string opcodes_apply_str(const Opcodes& ops, InputIt1 first1, InputIt1 last1, + InputIt2 first2, InputIt2 last2) +{ + return detail::opcodes_apply_impl>(ops, first1, last1, first2, last2); +} + +template +std::basic_string opcodes_apply_str(const Opcodes& ops, const Sentence1& s1, const Sentence2& s2) +{ + return detail::opcodes_apply_impl>(ops, detail::to_begin(s1), detail::to_end(s1), + detail::to_begin(s2), detail::to_end(s2)); +} + +template +std::vector editops_apply_vec(const Editops& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2) +{ + return detail::editops_apply_impl>(ops, first1, last1, first2, last2); +} + +template +std::vector editops_apply_vec(const Editops& ops, const Sentence1& s1, const Sentence2& s2) +{ + return detail::editops_apply_impl>(ops, detail::to_begin(s1), detail::to_end(s1), + detail::to_begin(s2), detail::to_end(s2)); +} + +template +std::vector opcodes_apply_vec(const Opcodes& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2) +{ + return detail::opcodes_apply_impl>(ops, first1, last1, first2, last2); +} + template -std::basic_string opcodes_apply(const Opcodes& ops, const Sentence1& s1, const Sentence2& s2) +std::vector opcodes_apply_vec(const Opcodes& ops, const Sentence1& s1, const Sentence2& s2) { - return opcodes_apply(ops, detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), - detail::to_end(s2)); + return detail::opcodes_apply_impl>(ops, detail::to_begin(s1), detail::to_end(s1), + detail::to_begin(s2), detail::to_end(s2)); } } // namespace rapidfuzz diff --git a/rapidfuzz/fuzz.hpp b/rapidfuzz/fuzz.hpp index fbf7ae26..d303722c 100644 --- a/rapidfuzz/fuzz.hpp +++ b/rapidfuzz/fuzz.hpp @@ -361,8 +361,8 @@ explicit CachedPartialTokenSortRatio(const Sentence1& s1) -> CachedPartialTokenSortRatio>; template -CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1) - -> CachedPartialTokenSortRatio>; +CachedPartialTokenSortRatio(InputIt1 first1, + InputIt1 last1) -> CachedPartialTokenSortRatio>; /** * @brief Compares the words in the strings based on unique and common words @@ -485,8 +485,8 @@ template explicit CachedPartialTokenSetRatio(const Sentence1& s1) -> CachedPartialTokenSetRatio>; template -CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1) - -> CachedPartialTokenSetRatio>; +CachedPartialTokenSetRatio(InputIt1 first1, + InputIt1 last1) -> CachedPartialTokenSetRatio>; /** * @brief Helper method that returns the maximum of fuzz::token_set_ratio and diff --git a/test/common.hpp b/test/common.hpp index 238e2055..427f0906 100644 --- a/test/common.hpp +++ b/test/common.hpp @@ -58,7 +58,7 @@ class BidirectionalIterWrapper { T iter; }; -template +template >> std::basic_string str_multiply(std::basic_string a, size_t b) { std::basic_string output; diff --git a/test/distance/examples/ocr.cpp b/test/distance/examples/ocr.cpp index c1a024ad..96b9f5c2 100644 --- a/test/distance/examples/ocr.cpp +++ b/test/distance/examples/ocr.cpp @@ -1,6 +1,6 @@ #include "ocr.hpp" -std::basic_string ocr_example1 = { +std::vector ocr_example1 = { 22, 18, 27, 22, 8, 23, 23, 18, 29, 27, 8, 23, 28, 18, 29, 27, 8, 24, 18, 27, 31, 8, 24, 18, 29, 22, 8, 24, 24, 18, 31, 24, 8, 23, 24, 18, 25, 25, 8, 24, 26, 18, 30, 24, 8, 23, 26, 18, 25, 30, 8, 29, 11, 2, 22, 18, 27, 22, 8, 23, 23, 18, 29, @@ -5075,7 +5075,7 @@ std::basic_string ocr_example1 = { 27, 8, 29, 7, 8, 39, 61, 80, 8, 27, 28, 22, 21, 8, 65, 79, 68, 61, 72, 81, 65, 74, 2}; -std::basic_string ocr_example2 = { +std::vector ocr_example2 = { 22, 18, 27, 22, 8, 23, 23, 18, 29, 27, 8, 23, 28, 18, 29, 27, 8, 24, 18, 27, 31, 8, 24, 18, 29, 22, 8, 24, 24, 18, 31, 24, 8, 23, 24, 18, 25, 25, 8, 24, 26, 18, 30, 24, 8, 23, 26, 18, 25, 30, 11, 2, 22, 18, 27, 22, 8, 23, 23, 18, 29, 27, 8, diff --git a/test/distance/examples/ocr.hpp b/test/distance/examples/ocr.hpp index 77d4cc13..f277d54b 100644 --- a/test/distance/examples/ocr.hpp +++ b/test/distance/examples/ocr.hpp @@ -1,6 +1,6 @@ #pragma once #include -#include +#include -extern std::basic_string ocr_example1; -extern std::basic_string ocr_example2; +extern std::vector ocr_example1; +extern std::vector ocr_example2; diff --git a/test/distance/examples/pythonLevenshteinIssue9.cpp b/test/distance/examples/pythonLevenshteinIssue9.cpp index 171c58fa..fdd636f9 100644 --- a/test/distance/examples/pythonLevenshteinIssue9.cpp +++ b/test/distance/examples/pythonLevenshteinIssue9.cpp @@ -2,7 +2,7 @@ namespace pythonLevenshteinIssue9 { -std::basic_string example1 = { +std::vector example1 = { 8, 14, 4, 2, 3, 7, 15, 6, 4, 5, 8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 10, 11, 12, 13, 8, 2, 8, 14, 4, 2, 3, 7, 15, 6, 4, 5, 8, 6, 7, 16, 7, 13, 17, 2, 4, 16, 14, 7, 14, 18, 19, 8, 20, 14, 4, 21, 13, 20, 22, 8, 2, 3, 4, 5, 6, 20, 8, 9, 10, 2, 10, 11, 12, 13, 8, 18, 14, @@ -206,7 +206,7 @@ std::basic_string example1 = { 9, 8, 6, 7, 3, 7, 23, 4, 41, 7, 51, 8, 48, 69, 43, 6, 4, 9, 11, 20, 2, 13, 32, 5, 8, 18, 16}; -std::basic_string example2 = { +std::vector example2 = { 3, 4, 5, 6, 7, 8, 9, 10, 2, 10, 11, 12, 13, 8, 2, 8, 41, 7, 9, 7, 13, 3, 18, 10, 5, 2, 4, 16, 14, 7, 14, 18, 19, 8, 20, 14, 4, 21, 13, 20, 22, 8, 2, 3, 4, 5, 6, 20, 8, 9, 10, 2, 10, 11, 12, 13, 8, 18, 14, 10, 7, 23, 17, 13, 4, 8, 11, 4, 14, 8, 15, 7, 12, 8, 14, 18, 16, 7, diff --git a/test/distance/examples/pythonLevenshteinIssue9.hpp b/test/distance/examples/pythonLevenshteinIssue9.hpp index b1e78a1e..b6e0cd78 100644 --- a/test/distance/examples/pythonLevenshteinIssue9.hpp +++ b/test/distance/examples/pythonLevenshteinIssue9.hpp @@ -1,8 +1,8 @@ #pragma once #include -#include +#include namespace pythonLevenshteinIssue9 { -extern std::basic_string example1; -extern std::basic_string example2; +extern std::vector example1; +extern std::vector example2; } // namespace pythonLevenshteinIssue9 diff --git a/test/distance/tests-Hamming.cpp b/test/distance/tests-Hamming.cpp index 7fa7187f..8f215791 100644 --- a/test/distance/tests-Hamming.cpp +++ b/test/distance/tests-Hamming.cpp @@ -110,13 +110,13 @@ TEST_CASE("Hamming_editops") { rapidfuzz::Editops ops = rapidfuzz::hamming_editops(s, d); - REQUIRE(d == rapidfuzz::editops_apply(ops, s, d)); + REQUIRE(d == rapidfuzz::editops_apply_str(ops, s, d)); REQUIRE(ops.get_src_len() == s.size()); REQUIRE(ops.get_dest_len() == d.size()); } { rapidfuzz::Editops ops = rapidfuzz::hamming_editops(d, s); - REQUIRE(s == rapidfuzz::editops_apply(ops, d, s)); + REQUIRE(s == rapidfuzz::editops_apply_str(ops, d, s)); REQUIRE(ops.get_src_len() == d.size()); REQUIRE(ops.get_dest_len() == s.size()); } diff --git a/test/distance/tests-Indel.cpp b/test/distance/tests-Indel.cpp index 8b9aedc9..7e96dc5b 100644 --- a/test/distance/tests-Indel.cpp +++ b/test/distance/tests-Indel.cpp @@ -277,7 +277,7 @@ TEST_CASE("Indel") REQUIRE(indel_distance(s1, s2) == 231); rapidfuzz::Editops ops = rapidfuzz::indel_editops(s1, s2); - REQUIRE(s2 == rapidfuzz::editops_apply(ops, s1, s2)); + REQUIRE(s2 == rapidfuzz::editops_apply_str(ops, s1, s2)); } } } diff --git a/test/distance/tests-Levenshtein.cpp b/test/distance/tests-Levenshtein.cpp index 38dea437..1c513b16 100644 --- a/test/distance/tests-Levenshtein.cpp +++ b/test/distance/tests-Levenshtein.cpp @@ -63,9 +63,9 @@ size_t levenshtein_distance(const Sentence1& s1, const Sentence2& s2, } template -std::basic_string get_subsequence(const std::basic_string& s, ptrdiff_t pos, ptrdiff_t len) +std::vector get_subsequence(const std::vector& s, ptrdiff_t pos, ptrdiff_t len) { - return std::basic_string(std::begin(s) + pos, std::begin(s) + pos + len); + return std::vector(std::begin(s) + pos, std::begin(s) + pos + len); } template @@ -233,7 +233,7 @@ TEST_CASE("Levenshtein_editops") std::string d = "XYZLorem ABC iPsum"; rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s, d); - REQUIRE(d == rapidfuzz::editops_apply(ops, s, d)); + REQUIRE(d == rapidfuzz::editops_apply_str(ops, s, d)); REQUIRE(ops.get_src_len() == s.size()); REQUIRE(ops.get_dest_len() == d.size()); } @@ -293,21 +293,21 @@ TEST_CASE("Levenshtein_editops[fuzzing_regressions]") std::string s1 = "b"; std::string s2 = "aaaaaaaaaaaaaaaabbaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2); - REQUIRE(s2 == rapidfuzz::editops_apply(ops, s1, s2)); + REQUIRE(s2 == rapidfuzz::editops_apply_str(ops, s1, s2)); } { std::string s1 = "aa"; std::string s2 = "abb"; rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2); - REQUIRE(s2 == rapidfuzz::editops_apply(ops, s1, s2)); + REQUIRE(s2 == rapidfuzz::editops_apply_str(ops, s1, s2)); } { std::string s1 = str_multiply(std::string("abb"), 8 * 64); std::string s2 = str_multiply(std::string("ccccca"), 8 * 64); rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2); - REQUIRE(s2 == rapidfuzz::editops_apply(ops, s1, s2)); + REQUIRE(s2 == rapidfuzz::editops_apply_str(ops, s1, s2)); } } @@ -352,7 +352,7 @@ TEST_CASE("Levenshtein small band") rapidfuzz::Editops ops1; rapidfuzz::detail::levenshtein_align(ops1, rapidfuzz::detail::Range(s1), rapidfuzz::detail::Range(s2)); - REQUIRE(s2 == rapidfuzz::editops_apply(ops1, s1, s2)); + REQUIRE(s2 == rapidfuzz::editops_apply_str(ops1, s1, s2)); rapidfuzz::Editops ops2; rapidfuzz::detail::levenshtein_align(ops2, rapidfuzz::detail::Range(s1), rapidfuzz::detail::Range(s2), ops1.size()); @@ -400,7 +400,7 @@ TEST_CASE("Levenshtein small band") rapidfuzz::Editops ops1; rapidfuzz::detail::levenshtein_align(ops1, rapidfuzz::detail::Range(s1), rapidfuzz::detail::Range(s2)); - REQUIRE(s2 == rapidfuzz::editops_apply(ops1, s1, s2)); + REQUIRE(s2 == rapidfuzz::editops_apply_str(ops1, s1, s2)); rapidfuzz::Editops ops2; rapidfuzz::detail::levenshtein_align(ops2, rapidfuzz::detail::Range(s1), rapidfuzz::detail::Range(s2), ops1.size()); @@ -416,21 +416,21 @@ TEST_CASE("Levenshtein large band (python-Levenshtein issue 9)") REQUIRE(example2.size() == 5569); { - std::basic_string s1 = get_subsequence(example1, 3718, 1509); - std::basic_string s2 = get_subsequence(example2, 2784, 2785); + std::vector s1 = get_subsequence(example1, 3718, 1509); + std::vector s2 = get_subsequence(example2, 2784, 2785); REQUIRE(rapidfuzz::levenshtein_distance(s1, s2) == 1587); rapidfuzz::Editops ops1 = rapidfuzz::levenshtein_editops(s1, s2); REQUIRE(ops1.size() == 1587); - REQUIRE(s2 == rapidfuzz::editops_apply(ops1, s1, s2)); + REQUIRE(s2 == rapidfuzz::editops_apply_vec(ops1, s1, s2)); } { REQUIRE(rapidfuzz::levenshtein_distance(example1, example2) == 2590); rapidfuzz::Editops ops1 = rapidfuzz::levenshtein_editops(example1, example2); REQUIRE(ops1.size() == 2590); - REQUIRE(example2 == rapidfuzz::editops_apply(ops1, example1, example2)); + REQUIRE(example2 == rapidfuzz::editops_apply_vec(ops1, example1, example2)); } } @@ -440,13 +440,13 @@ TEST_CASE("Levenshtein large band (ocr example)") REQUIRE(ocr_example2.size() == 107244); { - std::basic_string s1 = get_subsequence(ocr_example1, 51, 6541); - std::basic_string s2 = get_subsequence(ocr_example2, 51, 6516); + std::vector s1 = get_subsequence(ocr_example1, 51, 6541); + std::vector s2 = get_subsequence(ocr_example2, 51, 6516); rapidfuzz::Editops ops1; rapidfuzz::detail::levenshtein_align(ops1, rapidfuzz::detail::Range(s1), rapidfuzz::detail::Range(s2)); - REQUIRE(s2 == rapidfuzz::editops_apply(ops1, s1, s2)); + REQUIRE(s2 == rapidfuzz::editops_apply_vec(ops1, s1, s2)); rapidfuzz::Editops ops2; rapidfuzz::detail::levenshtein_align(ops2, rapidfuzz::detail::Range(s1), rapidfuzz::detail::Range(s2), ops1.size()); @@ -464,17 +464,17 @@ TEST_CASE("Levenshtein large band (ocr example)") { rapidfuzz::Editops ops1 = rapidfuzz::levenshtein_editops(ocr_example1, ocr_example2); REQUIRE(ops1.size() == 5278); - REQUIRE(ocr_example2 == rapidfuzz::editops_apply(ops1, ocr_example1, ocr_example2)); + REQUIRE(ocr_example2 == rapidfuzz::editops_apply_vec(ops1, ocr_example1, ocr_example2)); } { rapidfuzz::Editops ops1 = rapidfuzz::levenshtein_editops(ocr_example1, ocr_example2, 5278); REQUIRE(ops1.size() == 5278); - REQUIRE(ocr_example2 == rapidfuzz::editops_apply(ops1, ocr_example1, ocr_example2)); + REQUIRE(ocr_example2 == rapidfuzz::editops_apply_vec(ops1, ocr_example1, ocr_example2)); } { rapidfuzz::Editops ops1 = rapidfuzz::levenshtein_editops(ocr_example1, ocr_example2, 2000); REQUIRE(ops1.size() == 5278); - REQUIRE(ocr_example2 == rapidfuzz::editops_apply(ops1, ocr_example1, ocr_example2)); + REQUIRE(ocr_example2 == rapidfuzz::editops_apply_vec(ops1, ocr_example1, ocr_example2)); } }