Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sparse dot update #1

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions polyfuzz/models/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,8 @@
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity as scikit_cosine_similarity

try:
from sparse_dot_topn import awesome_cossim_topn
_HAVE_SPARSE_DOT = True
except ImportError:
_HAVE_SPARSE_DOT = False

from polyfuzz.models._utils_sdtn import _HAVE_SPARSE_DOT, sp_matmul_topn

def cosine_similarity(from_vector: np.ndarray,
to_vector: np.ndarray,
Expand Down Expand Up @@ -69,17 +65,16 @@ def cosine_similarity(from_vector: np.ndarray,

similarities = [np.round(1 - distances[:, i], 3) for i in range(distances.shape[1])]

# Fast, but does have some installation issues
# Fast
elif _HAVE_SPARSE_DOT and method == "sparse":
if isinstance(to_vector, np.ndarray):
to_vector = csr_matrix(to_vector)
if isinstance(from_vector, np.ndarray):
from_vector = csr_matrix(from_vector)

# There is a bug in awesome_cossim_topn: when to_vector and from_vector
# have the same shape, setting topn to 1 does not work. Apparently, you need
# to set it to at least 2 for it to work
similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, top_n+1, min_similarity)
similarity_matrix = sp_matmul_topn(
from_vector, to_vector, top_n=top_n, threshold=min_similarity, sort=True
)

if to_list is None:
similarity_matrix = similarity_matrix.tolil()
Expand Down
33 changes: 33 additions & 0 deletions polyfuzz/models/_utils_sdtn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import sys
import importlib.util
from scipy.sparse import csr_matrix

from typing import Optional

# True when the optional ``sparse_dot_topn`` package can be located. Callers
# check this flag before selecting the "sparse" similarity method.
_HAVE_SPARSE_DOT = importlib.util.find_spec("sparse_dot_topn") is not None

if _HAVE_SPARSE_DOT:
    try:
        # Prefer the newer API when the installed release provides it.
        # Detecting the feature directly (instead of keying on the Python
        # version) also covers an old release installed on a new interpreter.
        from sparse_dot_topn import sp_matmul_topn
    except ImportError:
        # Legacy release: only ``awesome_cossim_topn`` is available, so expose
        # it behind the ``sp_matmul_topn`` calling convention.
        from sparse_dot_topn import awesome_cossim_topn

        def sp_matmul_topn(
            A: csr_matrix,
            B: csr_matrix,
            top_n: int,
            threshold: float,
            sort: bool = True,
            n_threads: Optional[int] = None,
        ):
            """Adapt ``awesome_cossim_topn`` to the ``sp_matmul_topn`` signature.

            Arguments:
                A: Left-hand sparse matrix.
                B: Right-hand sparse matrix; it is transposed before being
                   handed to ``awesome_cossim_topn``.
                top_n: Number of best matches requested per row. At least 2
                       is always requested from the legacy function, which
                       reportedly misbehaves with a value of 1 when both
                       inputs have the same shape.
                threshold: Forwarded as ``lower_bound``.
                sort: Accepted for signature compatibility only; it is not
                      forwarded to ``awesome_cossim_topn``.
                n_threads: Number of threads; ``None`` or 0 means 1.
                           Threading is enabled only for values above 1.

            Returns:
                Whatever ``awesome_cossim_topn`` returns for these arguments.
            """
            n_threads = n_threads or 1
            return awesome_cossim_topn(
                A,
                B.T,
                ntop=max(top_n, 2),
                lower_bound=threshold,
                use_threads=n_threads > 1,
                n_jobs=n_threads,
            )

else:

    def sp_matmul_topn(
        A: csr_matrix,
        B: csr_matrix,
        top_n: int,
        threshold: float,
        sort: bool = True,
        n_threads: Optional[int] = None,
    ):
        """Placeholder used when ``sparse_dot_topn`` is not installed.

        Defining the name keeps ``from ... import sp_matmul_topn`` working
        without the optional dependency; the error is deferred to call time.

        Raises:
            ImportError: Always, because the backend package is missing.
        """
        raise ImportError(
            "sparse_dot_topn is not installed; install it with "
            "`pip install sparse_dot_topn` to use sp_matmul_topn."
        )


__all__ = ["sp_matmul_topn"]
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
]

fast_cosine = [
"sparse_dot_topn>=0.2.9"
"sparse_dot_topn<1.0; python_version < '3.8'",
"sparse_dot_topn>=1.1.1; python_version >= '3.8'",
]

embeddings_packages = [
Expand Down
Loading