Skip to content

Commit

Permalink
correct normalize Partial/TokenSort/TokenSet
Browse files Browse the repository at this point in the history
  • Loading branch information
matthieugomez committed Jul 20, 2020
1 parent b5a2a10 commit e0ef0e8
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.7.0"
version = "0.7.1"

[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Expand Down
4 changes: 2 additions & 2 deletions src/distances/edit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ where ``m`` is the number of matching characters and
struct Jaro <: SemiMetric end

## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
function (dist::Jaro)(s1, s2, ::Nothing)
function (dist::Jaro)(s1, s2, nothing::Nothing = nothing)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
Expand Down Expand Up @@ -180,7 +180,7 @@ region on either side of the longest common subsequence.
"""
struct RatcliffObershelp <: SemiMetric end

function (dist::RatcliffObershelp)(s1, s2, ::Nothing)
function (dist::RatcliffObershelp)(s1, s2, nothing::Nothing = nothing)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
n_matched = sum(last.(matching_blocks(s1, s2)))
Expand Down
10 changes: 5 additions & 5 deletions src/distances/qgram.jl
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ struct QGram <: QGramDistance
q::Int
end

function (dist::QGram)(s1, s2, ::Nothing)
function (dist::QGram)(s1, s2, nothing::Nothing = nothing)
((s1 === missing) | (s2 === missing)) && return missing
n = 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
Expand All @@ -124,7 +124,7 @@ struct Cosine <: QGramDistance
q::Int
end

function (dist::Cosine)(s1, s2, ::Nothing)
function (dist::Cosine)(s1, s2, nothing::Nothing = nothing)
((s1 === missing) | (s2 === missing)) && return missing
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
Expand All @@ -151,7 +151,7 @@ struct Jaccard <: QGramDistance
q::Int
end

function (dist::Jaccard)(s1, s2, ::Nothing)
function (dist::Jaccard)(s1, s2, nothing::Nothing = nothing)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
Expand All @@ -178,7 +178,7 @@ struct SorensenDice <: QGramDistance
q::Int
end

function (dist::SorensenDice)(s1, s2, ::Nothing)
function (dist::SorensenDice)(s1, s2, nothing::Nothing = nothing)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
Expand All @@ -205,7 +205,7 @@ struct Overlap <: QGramDistance
q::Int
end

function (dist::Overlap)(s1, s2, ::Nothing)
function (dist::Overlap)(s1, s2, nothing::Nothing = nothing)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
Expand Down
4 changes: 2 additions & 2 deletions src/modifiers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = nothing)
# Make sure the substring of s2 has length len1
s2_start = r[2] - r[1] + 1
s2_end = s2_start + len1 - 1
if s2_start <= 0
if s2_start < 1
s2_end += 1 - s2_start
s2_start += 1 - s2_start
elseif s2_end > len2
s2_start += len2 - s2_end
s2_end += len2 - s2_end
end
curr = dist.dist(s1, _slice(s2, s2_start - 1, s2_end))
curr = dist.dist(s1, _slice(s2, s2_start, s2_end))
out = min(out, curr)
end
return out
Expand Down
3 changes: 3 additions & 0 deletions src/normalize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ end
Normalize a metric, so that `evaluate` always returns a Float64 between 0 and 1
"""
function normalize(dist::Partial)
    # Normalize the wrapped distance first, then re-apply the Partial modifier.
    inner = normalize(dist.dist)
    return Partial(inner)
end

function normalize(dist::TokenSort)
    # Normalize the wrapped distance first, then re-apply the TokenSort modifier.
    inner = normalize(dist.dist)
    return TokenSort(inner)
end

function normalize(dist::TokenSet)
    # Normalize the wrapped distance first, then re-apply the TokenSet modifier.
    inner = normalize(dist.dist)
    return TokenSet(inner)
end

function normalize(dist::SemiMetric)
    # Fallback for any other distance: wrap it directly in `Normalized`.
    return Normalized(dist)
end

function normalize(dist::Normalized)
    # Already wrapped in `Normalized`: return it unchanged rather than wrapping twice.
    return dist
end

Expand Down
3 changes: 2 additions & 1 deletion test/modifiers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ using StringDistances, Unicode, Test
#Levenshtein
compare("aüa", "aua", Levenshtein())
compare("aüa", "aua", DamerauLevenshtein())
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0

# Winkler
@test compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.9611 atol = 1e-4
Expand All @@ -40,7 +41,7 @@ using StringDistances, Unicode, Test
@test compare("New York Yankees", "", Partial(Jaro())) ≈ 0.0
@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444
#@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444
@test compare("HSINCHUANG", "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875
@test compare("HSINCHUANG", "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8
@test compare("HSINCHUANG", "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8
Expand Down

2 comments on commit e0ef0e8

@matthieugomez
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/18185

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.7.1 -m "<description of version>" e0ef0e8ec1a8ab6bb2f004ed68daea381717cce3
git push origin v0.7.1

Please sign in to comment.