From bea15fbe6ea6b7f781684c286c36eab3ef5bf654 Mon Sep 17 00:00:00 2001 From: zzzz-vincent Date: Fri, 3 Mar 2023 00:22:41 -0800 Subject: [PATCH 1/3] add jaccard as additional distance metrics. --- docs/source/api/index.rst | 1 + quaterion/distances/__init__.py | 4 +++- quaterion/distances/jaccard.py | 37 +++++++++++++++++++++++++++++++++ tests/test_distances.py | 11 ++++++++++ 4 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 quaterion/distances/jaccard.py diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst index 1e9a9542..ea6a8823 100644 --- a/docs/source/api/index.rst +++ b/docs/source/api/index.rst @@ -49,6 +49,7 @@ DISTANCES ~dot_product.DotProduct ~euclidean.Euclidean ~manhattan.Manhattan + ~jaccard.Jaccard EVAL ---- diff --git a/quaterion/distances/__init__.py b/quaterion/distances/__init__.py index 69b5fe39..42e6cdd1 100644 --- a/quaterion/distances/__init__.py +++ b/quaterion/distances/__init__.py @@ -5,7 +5,7 @@ from quaterion.distances.dot_product import DotProduct from quaterion.distances.euclidean import Euclidean from quaterion.distances.manhattan import Manhattan - +from quaterion.distances.jaccard import Jaccard class Distance(str, Enum): """An enumerator to pass distance metric names across the package.""" @@ -14,6 +14,7 @@ class Distance(str, Enum): COSINE = "cosine" DOT_PRODUCT = "dot_product" MANHATTAN = "manhattan" + JACCARD = "jaccard" @staticmethod def get_by_name(name: str) -> BaseDistance: @@ -26,6 +27,7 @@ def get_by_name(name: str) -> BaseDistance: "euclidean": Euclidean, "manhattan": Manhattan, "dot_product": DotProduct, + "jaccard": Jaccard, } try: diff --git a/quaterion/distances/jaccard.py b/quaterion/distances/jaccard.py new file mode 100644 index 00000000..69ede8ce --- /dev/null +++ b/quaterion/distances/jaccard.py @@ -0,0 +1,37 @@ +from typing import Optional + +import torch +import torch.nn.functional as F +from torch import Tensor + +from quaterion.distances.base_distance import BaseDistance + + 
+class Jaccard(BaseDistance): + """Compute Weighted Jaccard distances (and its interpretation as similarities). + + Note: + The implementation of Weighted Jaccard + (https://en.wikipedia.org/wiki/Jaccard_index#Weighted_Jaccard_similarity_and_distance) + supports Tensors with postivie float values. + """ + + @staticmethod + def distance(x: Tensor, y: Tensor) -> Tensor: + return 1 - Jaccard.similarity(x, y) + + @staticmethod + def similarity(x: Tensor, y: Tensor) -> Tensor: + min_sum = torch.minimum(x, y).sum(dim=-1) + max_sum = torch.maximum(x, y).sum(dim=-1) + return min_sum / max_sum + + @staticmethod + def distance_matrix(x: Tensor, y: Optional[Tensor] = None) -> Tensor: + return 1 - Jaccard.similarity_matrix(x, y) + + @staticmethod + def similarity_matrix(x: Tensor, y: Optional[Tensor] = None) -> Tensor: + if y is None: + y = x + return Jaccard.similarity(x.unsqueeze(1), y.unsqueeze(0)) diff --git a/tests/test_distances.py b/tests/test_distances.py index c48524be..88b4de48 100644 --- a/tests/test_distances.py +++ b/tests/test_distances.py @@ -12,6 +12,13 @@ class TestDistances: ] ) + x_2 = torch.tensor( + [ + [1.0, 1.5, 2.0, 3.0], + [0.5, 2.5, 2.5, 1.0], + ] + ) + x_dim = x.size()[0] expected = { "cosine": { @@ -30,6 +37,10 @@ class TestDistances: "similarity_matrix": torch.tensor([[16.25, -16.25], [-16.25, 16.25]]), "distance_matrix": torch.tensor([[-16.25, 16.25], [16.25, -16.25]]), }, + "jaccard": { + "similarity_matrix": torch.tensor([[1.0000, 0.5556], [0.5556, 1.0000]]), + "distance_matrix": torch.tensor([[0.0000, 0.4444], [0.4444, 0.0000]]), + } } @pytest.mark.parametrize( From e56f277fa6a45701913170e36ed47343877d1759 Mon Sep 17 00:00:00 2001 From: zzzz-vincent Date: Fri, 3 Mar 2023 00:25:54 -0800 Subject: [PATCH 2/3] fix typo --- quaterion/distances/jaccard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quaterion/distances/jaccard.py b/quaterion/distances/jaccard.py index 69ede8ce..22807934 100644 --- 
a/quaterion/distances/jaccard.py +++ b/quaterion/distances/jaccard.py @@ -13,7 +13,7 @@ class Jaccard(BaseDistance): Note: The implementation of Weighted Jaccard (https://en.wikipedia.org/wiki/Jaccard_index#Weighted_Jaccard_similarity_and_distance) - supports Tensors with postivie float values. + supports Tensors with positive float values. """ @staticmethod From 230511bce99d165d2a10980ad75d3c68a4359e89 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Mar 2023 08:28:23 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- quaterion/distances/__init__.py | 3 ++- tests/test_distances.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/quaterion/distances/__init__.py b/quaterion/distances/__init__.py index 42e6cdd1..8799060b 100644 --- a/quaterion/distances/__init__.py +++ b/quaterion/distances/__init__.py @@ -4,8 +4,9 @@ from quaterion.distances.cosine import Cosine from quaterion.distances.dot_product import DotProduct from quaterion.distances.euclidean import Euclidean -from quaterion.distances.manhattan import Manhattan from quaterion.distances.jaccard import Jaccard +from quaterion.distances.manhattan import Manhattan + class Distance(str, Enum): """An enumerator to pass distance metric names across the package.""" diff --git a/tests/test_distances.py b/tests/test_distances.py index 88b4de48..f4d2c294 100644 --- a/tests/test_distances.py +++ b/tests/test_distances.py @@ -37,10 +37,10 @@ class TestDistances: "similarity_matrix": torch.tensor([[16.25, -16.25], [-16.25, 16.25]]), "distance_matrix": torch.tensor([[-16.25, 16.25], [16.25, -16.25]]), }, - "jaccard": { - "similarity_matrix": torch.tensor([[1.0000, 0.5556], [0.5556, 1.0000]]), - "distance_matrix": torch.tensor([[0.0000, 0.4444], [0.4444, 0.0000]]), - } + "jaccard": { + "similarity_matrix": torch.tensor([[1.0000, 0.5556], [0.5556, 
1.0000]]), + "distance_matrix": torch.tensor([[0.0000, 0.4444], [0.4444, 0.0000]]), + }, } @pytest.mark.parametrize(