diff --git a/CHANGELOG.md b/CHANGELOG.md index e7fcda75..f3c3e962 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ ## Changelog +## [3.0.5] - 2024-07-02 +### Fixed +- the editops implementation didn't properly account for some cells in the Levenshtein matrix. + This could lead both to incorrect results and crashes. + ## [3.0.4] - 2023-04-07 ### Fixed - fix tagged version diff --git a/CMakeLists.txt b/CMakeLists.txt index a6bed59e..fcdc0176 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt") endif() -project(rapidfuzz LANGUAGES CXX VERSION 3.0.4) +project(rapidfuzz LANGUAGES CXX VERSION 3.0.5) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") include(GNUInstallDirs) diff --git a/extras/rapidfuzz_amalgamated.hpp b/extras/rapidfuzz_amalgamated.hpp index f43224f5..e8978d0d 100644 --- a/extras/rapidfuzz_amalgamated.hpp +++ b/extras/rapidfuzz_amalgamated.hpp @@ -1,7 +1,7 @@ // Licensed under the MIT License . // SPDX-License-Identifier: MIT // RapidFuzz v1.0.2 -// Generated: 2024-04-06 15:39:26.940916 +// Generated: 2024-07-02 16:47:26.932914 // ---------------------------------------------------------- // This file is an amalgamation of multiple different files. // You probably shouldn't edit it directly. 
@@ -7719,6 +7719,9 @@ template HirschbergPos find_hirschberg_pos(const Range& s1, const Range& s2, size_t max = std::numeric_limits::max()) { + assert(s1.size() > 1); + assert(s2.size() > 1); + HirschbergPos hpos = {}; size_t left_size = s2.size() / 2; size_t right_size = s2.size() - left_size; @@ -7727,8 +7730,9 @@ HirschbergPos find_hirschberg_pos(const Range& s1, const Range::max(); size_t right_first_pos = 0; size_t right_last_pos = 0; + // todo: we could avoid this allocation by counting up the right score twice + // not sure what's faster though std::vector right_scores; - { auto right_row = levenshtein_row(s1.reversed(), s2.reversed(), max, right_size - 1); if (right_row.dist > max) return find_hirschberg_pos(s1, s2, max * 2); @@ -7758,6 +7762,17 @@ HirschbergPos find_hirschberg_pos(const Range& s1, const Range= left_first_pos + right_first_pos) { + size_t right_index = s1_len - left_first_pos - right_first_pos; + if (right_index < right_scores.size()) { + best_score = right_scores[right_index] + left_score; + hpos.left_score = left_score; + hpos.right_score = right_scores[right_index]; + hpos.s1_mid = left_first_pos; + } + } + for (size_t i = left_first_pos; i < left_last_pos; ++i) { size_t col_pos = i % 64; size_t col_word = i / 64; diff --git a/rapidfuzz/distance/Levenshtein_impl.hpp b/rapidfuzz/distance/Levenshtein_impl.hpp index 07172477..2fa07f35 100644 --- a/rapidfuzz/distance/Levenshtein_impl.hpp +++ b/rapidfuzz/distance/Levenshtein_impl.hpp @@ -1055,6 +1055,9 @@ template HirschbergPos find_hirschberg_pos(const Range& s1, const Range& s2, size_t max = std::numeric_limits::max()) { + assert(s1.size() > 1); + assert(s2.size() > 1); + HirschbergPos hpos = {}; size_t left_size = s2.size() / 2; size_t right_size = s2.size() - left_size; @@ -1063,8 +1066,9 @@ HirschbergPos find_hirschberg_pos(const Range& s1, const Range::max(); size_t right_first_pos = 0; size_t right_last_pos = 0; + // todo: we could avoid this allocation by counting up the right score 
twice + // not sure what's faster though std::vector right_scores; - { auto right_row = levenshtein_row(s1.reversed(), s2.reversed(), max, right_size - 1); if (right_row.dist > max) return find_hirschberg_pos(s1, s2, max * 2); @@ -1094,6 +1098,17 @@ HirschbergPos find_hirschberg_pos(const Range& s1, const Range= left_first_pos + right_first_pos) { + size_t right_index = s1_len - left_first_pos - right_first_pos; + if (right_index < right_scores.size()) { + best_score = right_scores[right_index] + left_score; + hpos.left_score = left_score; + hpos.right_score = right_scores[right_index]; + hpos.s1_mid = left_first_pos; + } + } + for (size_t i = left_first_pos; i < left_last_pos; ++i) { size_t col_pos = i % 64; size_t col_word = i / 64;