Skip to content

Commit

Permalink
fix wraparound issue in Jaro simd
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Oct 31, 2023
1 parent 39d36d5 commit 79fdab2
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 63 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## Changelog

## [2.2.1] - 2023-10-31
### Fixed
- fix wraparound issue in simd implementation of Jaro and Jaro Winkler

## [2.2.0] - 2023-10-30
#### Performance
- improve performance of simd implementation for LCS and Indel by up to 50%
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt")
endif()

project(rapidfuzz LANGUAGES CXX VERSION 2.2.0)
project(rapidfuzz LANGUAGES CXX VERSION 2.2.1)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
include(GNUInstallDirs)
Expand Down
82 changes: 52 additions & 30 deletions extras/rapidfuzz_amalgamated.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
// SPDX-License-Identifier: MIT
// RapidFuzz v1.0.2
// Generated: 2023-10-28 05:15:10.035216
// Generated: 2023-10-31 01:19:54.085893
// ----------------------------------------------------------
// This file is an amalgamation of multiple different files.
// You probably shouldn't edit it directly.
Expand Down Expand Up @@ -5506,6 +5506,7 @@ static inline size_t count_transpositions_word(const PM_Vec& PM, Range<InputIt1>
{
uint64_t P_flag = flagged.P_flag;
uint64_t T_flag = flagged.T_flag;

size_t Transpositions = 0;
while (T_flag) {
uint64_t PatternFlagMask = blsi(P_flag);
Expand Down Expand Up @@ -5719,6 +5720,12 @@ static inline auto jaro_similarity_prepare_bound(const VecType* s1_lengths, Rang
native_simd<VecType> zero(VecType(0));
native_simd<VecType> one(1);

VecType maxLen = 0;
// TODO: permute + max to find maxLen
// side-note: we know only the first 8 bits are actually used
for (size_t i = 0; i < vec_width; ++i)
if (s1_lengths[i] > maxLen) maxLen = s1_lengths[i];

// since the normal implementation would overflow in this case + s2 is guaranteed to be longer
// handle this case separately
if (static_cast<size_t>(s2.size()) > sizeof(VecType) * 8) {
Expand All @@ -5744,23 +5751,15 @@ static inline auto jaro_similarity_prepare_bound(const VecType* s1_lengths, Rang
bounds.boundMaskSize = sllv(one, boundSizes << 1) - one;
bounds.boundMask = sllv(one, boundSizes + one) - one;

VecType maxLen = 0;
// todo permutate + max to find maxLen
// side-note: we know only the first 8 bit are actually used
alignas(alignment) std::array<VecType, vec_width> lens_;
s1_lengths_simd.store(lens_.data());
for (size_t i = 0; i < vec_width; ++i)
if (lens_[i] > maxLen) maxLen = lens_[i];

bounds.maxBound =
(s2.size() > static_cast<ptrdiff_t>(maxLen)) ? s2.size() : static_cast<ptrdiff_t>(maxLen);
bounds.maxBound /= 2;
if (bounds.maxBound > 0) bounds.maxBound--;

int64_t lastRelevantChar = maxLen + bounds.maxBound;
if (s2.size() > lastRelevantChar) s2.remove_suffix(s2.size() - lastRelevantChar);
}

int64_t lastRelevantChar = maxLen + bounds.maxBound;
if (s2.size() > lastRelevantChar) s2.remove_suffix(s2.size() - lastRelevantChar);

return bounds;
}
# else
Expand All @@ -5774,14 +5773,19 @@ static inline auto jaro_similarity_prepare_bound(const VecType* s1_lengths, Rang

JaroSimilaritySimdBounds<native_simd<VecType>> bounds;

VecType maxLen = 0;
// TODO: permute + max to find maxLen
// side-note: we know only the first 8 bits are actually used
for (size_t i = 0; i < vec_width; ++i)
if (s1_lengths[i] > maxLen) maxLen = s1_lengths[i];

if (static_cast<size_t>(s2.size()) > sizeof(VecType) * 8) {
bounds.maxBound = s2.size() / 2 - 1;
bounds.boundMaskSize =
native_simd<VecType>(bit_mask_lsb<VecType>(static_cast<int>(2 * bounds.maxBound)));
bounds.boundMask = native_simd<VecType>(bit_mask_lsb<VecType>(static_cast<int>(bounds.maxBound + 1)));
}
else {
int64_t lastRelevantChar = 0;
alignas(alignment) std::array<VecType, vec_width> boundMaskSize_;
alignas(alignment) std::array<VecType, vec_width> boundMask_;

Expand All @@ -5790,8 +5794,6 @@ static inline auto jaro_similarity_prepare_bound(const VecType* s1_lengths, Rang
int64_t s1_len = static_cast<int64_t>(s1_lengths[i]);
int64_t Bound = jaro_bounds(s1_len, s2.size());

if (s1_len + Bound > lastRelevantChar) lastRelevantChar = s1_len + Bound;

if (Bound > bounds.maxBound) bounds.maxBound = Bound;

boundMaskSize_[i] = bit_mask_lsb<VecType>(static_cast<int>(2 * Bound));
Expand All @@ -5800,10 +5802,11 @@ static inline auto jaro_similarity_prepare_bound(const VecType* s1_lengths, Rang

bounds.boundMaskSize = native_simd<VecType>(reinterpret_cast<uint64_t*>(boundMaskSize_.data()));
bounds.boundMask = native_simd<VecType>(reinterpret_cast<uint64_t*>(boundMask_.data()));

if (s2.size() > lastRelevantChar) s2.remove_suffix(s2.size() - lastRelevantChar);
}

int64_t lastRelevantChar = maxLen + bounds.maxBound;
if (s2.size() > lastRelevantChar) s2.remove_suffix(s2.size() - lastRelevantChar);

return bounds;
}
# endif
Expand Down Expand Up @@ -5847,8 +5850,12 @@ static inline void jaro_similarity_simd(Range<double*> scores, const detail::Blo
auto bounds = jaro_similarity_prepare_bound(s1_lengths + result_index, s2_cur);

native_simd<VecType> P_flag(VecType(0));
native_simd<VecType> T_flag(VecType(0));
native_simd<VecType> counter(VecType(1));
// the second sequence can have a length of up to 2 * sizeof(VecType) * 8 characters (two
// VecType-wide flag words), since anything above this would be removed in jaro_similarity_prepare_bound
std::array<native_simd<VecType>, 2> T_flag(
{native_simd<VecType>(VecType(0)), native_simd<VecType>(VecType(0))});
std::array<native_simd<VecType>, 2> counter(
{native_simd<VecType>(VecType(1)), native_simd<VecType>(VecType(1))});

// In case s2 is longer than all of the elements in s1_lengths boundMaskSize
// might have all bits set and therefore the condition ((boundMask <= boundMaskSize) & one)
Expand All @@ -5863,9 +5870,10 @@ static inline void jaro_similarity_simd(Range<double*> scores, const detail::Blo
native_simd<VecType> PM_j = andnot(X & bounds.boundMask, P_flag);

P_flag |= blsi(PM_j);
T_flag |= andnot(counter, (PM_j == zero));
auto T_word_index = j / (sizeof(VecType) * 8);
T_flag[T_word_index] |= andnot(counter[T_word_index], (PM_j == zero));

counter = counter << 1;
counter[T_word_index] = counter[T_word_index] << 1;
bounds.boundMask = (bounds.boundMask << 1) | ((bounds.boundMask <= bounds.boundMaskSize) & one);
}

Expand All @@ -5876,17 +5884,20 @@ static inline void jaro_similarity_simd(Range<double*> scores, const detail::Blo
native_simd<VecType> PM_j = andnot(X & bounds.boundMask, P_flag);

P_flag |= blsi(PM_j);
T_flag |= andnot(counter, (PM_j == zero));
auto T_word_index = j / (sizeof(VecType) * 8);
T_flag[T_word_index] |= andnot(counter[T_word_index], (PM_j == zero));

counter = counter << 1;
counter[T_word_index] = counter[T_word_index] << 1;
bounds.boundMask = bounds.boundMask << 1;
}

auto counts = popcount(P_flag);
alignas(alignment) std::array<VecType, vec_width> P_flags;
P_flag.store(P_flags.data());
alignas(alignment) std::array<VecType, vec_width> T_flags;
T_flag.store(T_flags.data());
alignas(alignment) std::array<VecType, vec_width> T_flags0;
T_flag[0].store(T_flags0.data());
alignas(alignment) std::array<VecType, vec_width> T_flags1;
T_flag[1].store(T_flags1.data());
for (size_t i = 0; i < vec_width; ++i) {
VecType CommonChars = counts[i];
if (!jaro_common_char_filter(static_cast<int64_t>(s1_lengths[result_index]), s2.size(),
Expand All @@ -5898,19 +5909,30 @@ static inline void jaro_similarity_simd(Range<double*> scores, const detail::Blo
}

VecType P_flag_cur = P_flags[i];
VecType T_flag_cur = T_flags[i];
VecType T_flag0_cur = T_flags0[i];
VecType T_flag1_cur = T_flags1[i];
size_t Transpositions = 0;

static constexpr size_t vecs_per_word = vec_width / vecs;
size_t cur_block = i / vecs_per_word;
int64_t offset = static_cast<int64_t>(sizeof(VecType) * 8 * (i % vecs_per_word));
while (T_flag_cur) {
while (T_flag0_cur) {
VecType PatternFlagMask = blsi(P_flag_cur);

uint64_t PM_j = block.get(cur_block, s2[countr_zero(T_flag0_cur)]);
Transpositions += !(PM_j & (static_cast<uint64_t>(PatternFlagMask) << offset));

T_flag0_cur = blsr(T_flag0_cur);
P_flag_cur ^= PatternFlagMask;
}

while (T_flag1_cur) {
VecType PatternFlagMask = blsi(P_flag_cur);

uint64_t PM_j = block.get(cur_block, s2[countr_zero(T_flag_cur)]);
uint64_t PM_j = block.get(cur_block, s2[countr_zero(T_flag1_cur) + sizeof(VecType) * 8]);
Transpositions += !(PM_j & (static_cast<uint64_t>(PatternFlagMask) << offset));

T_flag_cur = blsr(T_flag_cur);
T_flag1_cur = blsr(T_flag1_cur);
P_flag_cur ^= PatternFlagMask;
}

Expand Down Expand Up @@ -6326,7 +6348,7 @@ struct MultiJaroWinkler : public detail::MultiSimilarityBase<MultiJaroWinkler<Ma
friend detail::MultiNormalizedMetricBase<MultiJaroWinkler<MaxLen>, double>;

public:
MultiJaroWinkler(size_t count, double prefix_weight_) : scorer(count), prefix_weight(prefix_weight_)
MultiJaroWinkler(size_t count, double prefix_weight_ = 0.1) : scorer(count), prefix_weight(prefix_weight_)
{}

/**
Expand Down
2 changes: 1 addition & 1 deletion rapidfuzz/distance/JaroWinkler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ struct MultiJaroWinkler : public detail::MultiSimilarityBase<MultiJaroWinkler<Ma
friend detail::MultiNormalizedMetricBase<MultiJaroWinkler<MaxLen>, double>;

public:
MultiJaroWinkler(size_t count, double prefix_weight_) : scorer(count), prefix_weight(prefix_weight_)
MultiJaroWinkler(size_t count, double prefix_weight_ = 0.1) : scorer(count), prefix_weight(prefix_weight_)
{}

/**
Expand Down
Loading

0 comments on commit 79fdab2

Please sign in to comment.