From b56fb137360885e81926fe8a4d60138d5d08f910 Mon Sep 17 00:00:00 2001 From: Matthias Als Date: Thu, 13 Jun 2024 11:26:42 +0200 Subject: [PATCH] Adding Tabu search as new algorithm (#29) --- .github/workflows/prepare_release.yaml | 2 +- anti_clustering/__init__.py | 1 + anti_clustering/_cluster_swap_heuristic.py | 4 +- anti_clustering/tabu_search_heuristic.py | 99 ++++++++++++++++++++++ examples/evaluation.py | 28 ++++-- 5 files changed, 124 insertions(+), 10 deletions(-) create mode 100644 anti_clustering/tabu_search_heuristic.py diff --git a/.github/workflows/prepare_release.yaml b/.github/workflows/prepare_release.yaml index e0d1f31..eb5c98c 100644 --- a/.github/workflows/prepare_release.yaml +++ b/.github/workflows/prepare_release.yaml @@ -15,4 +15,4 @@ jobs: - uses: SneaksAndData/github-actions/semver_release@v0.1.0 with: major_v: 0 - minor_v: 3 + minor_v: 4 diff --git a/anti_clustering/__init__.py b/anti_clustering/__init__.py index 90715a5..6087b62 100644 --- a/anti_clustering/__init__.py +++ b/anti_clustering/__init__.py @@ -17,4 +17,5 @@ from anti_clustering.naive_random_heuristic import NaiveRandomHeuristicAntiClustering from anti_clustering.exact_cluster_editing import ExactClusterEditingAntiClustering from anti_clustering.exchange_heuristic import ExchangeHeuristicAntiClustering +from anti_clustering.tabu_search_heuristic import TabuSearchHeuristicAntiClustering from anti_clustering._base import AntiClustering diff --git a/anti_clustering/_cluster_swap_heuristic.py b/anti_clustering/_cluster_swap_heuristic.py index 2bf039b..2aac607 100644 --- a/anti_clustering/_cluster_swap_heuristic.py +++ b/anti_clustering/_cluster_swap_heuristic.py @@ -84,8 +84,8 @@ def _get_random_clusters(self, num_groups: int, num_elements: int) -> npt.NDArra def _calculate_objective(self, cluster_assignment: npt.NDArray[bool], distance_matrix: npt.NDArray[float]) -> float: """ Calculate objective value - :param cluster_assignment: Cluster assignment matrix - :param 
distance_matrix: Cost matrix + :param cluster_assignment: Cluster assignment + :param distance_matrix: Distance matrix :return: Objective value """ return np.multiply(cluster_assignment, distance_matrix).sum() diff --git a/anti_clustering/tabu_search_heuristic.py b/anti_clustering/tabu_search_heuristic.py new file mode 100644 index 0000000..df11246 --- /dev/null +++ b/anti_clustering/tabu_search_heuristic.py @@ -0,0 +1,99 @@ +# Copyright 2022 ECCO Sneaks & Data +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A tabu search with restarts approach to solving the anti-clustering problem. +""" + +import numpy.typing as npt +from anti_clustering._cluster_swap_heuristic import ClusterSwapHeuristic + + +class TabuSearchHeuristicAntiClustering(ClusterSwapHeuristic): +    """ +    A tabu search with restarts approach to solving the anti-clustering problem. +    In this version, specific transformations are put in the tabu list, not solutions. 
+ """ + + def __init__( + self, + verbose: bool = False, + random_seed: int = None, + tabu_tenure: int = 10, + iterations: int = 2000, + restarts: int = 9, + ): + # pylint: disable = R0913 + super().__init__(verbose=verbose, random_seed=random_seed) + self.tabu_tenure = tabu_tenure + self.iterations = iterations + self.restarts = restarts + + def _solve(self, distance_matrix: npt.NDArray[float], num_groups: int) -> npt.NDArray[bool]: + # Start with random cluster assignment + cluster_assignment = self._get_random_clusters(num_groups=num_groups, num_elements=len(distance_matrix)) + + if self.verbose: + print("Solving") + + candidate_solutions = [] + + for restart in range(self.restarts): + tabu_swaps = [] + # Initial objective value + objective = self._calculate_objective(cluster_assignment, distance_matrix) + for iteration in range(self.iterations): + if self.verbose and iteration % 5 == 0: + print(f"Iteration {iteration + 1} of {self.iterations}") + + # Select random element + i = self.rnd.randint(0, len(distance_matrix) - 1) + + # Get possible swaps + possible_exchanges = [ + j + for j in self._get_exchanges(cluster_assignment, i) + if (i, j) not in tabu_swaps and (j, i) not in tabu_swaps + ] + + if len(possible_exchanges) == 0: + continue + + # Generate possible assignments + j = possible_exchanges[self.rnd.randint(0, len(possible_exchanges) - 1)] + + # Select random possible swap. 
+ new_cluster_assignment = self._swap(cluster_assignment, i, j) + new_objective = self._calculate_objective(new_cluster_assignment, distance_matrix) + + # Select solution as current if it improves the objective value + if new_objective > objective: + cluster_assignment = new_cluster_assignment + objective = new_objective + tabu_swaps.append((i, j)) + # Delete oldest tabu swap if tabu list is full + if len(tabu_swaps) > self.tabu_tenure: + tabu_swaps.pop(0) + + candidate_solutions.append((objective, cluster_assignment)) + + if self.verbose: + print(f"Restart {restart + 1} of {self.restarts}") + + # Cold restart, select random cluster assignment + cluster_assignment = self._get_random_clusters(num_groups=num_groups, num_elements=len(distance_matrix)) + + # Select best solution, maximizing objective + _, best_cluster_assignment = max(candidate_solutions, key=lambda x: x[0]) + + return best_cluster_assignment diff --git a/examples/evaluation.py b/examples/evaluation.py index aeb0e47..4587e71 100644 --- a/examples/evaluation.py +++ b/examples/evaluation.py @@ -25,6 +25,7 @@ ExchangeHeuristicAntiClustering, SimulatedAnnealingHeuristicAntiClustering, NaiveRandomHeuristicAntiClustering, + TabuSearchHeuristicAntiClustering, ExactClusterEditingAntiClustering, AntiClustering, ) @@ -36,15 +37,18 @@ iris_df = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names) methods: List[AntiClustering] = [ + TabuSearchHeuristicAntiClustering(iterations=5000, restarts=10, tabu_tenure=50), ExchangeHeuristicAntiClustering(restarts=20), SimulatedAnnealingHeuristicAntiClustering(alpha=0.95, iterations=5000, starting_temperature=1000, restarts=20), NaiveRandomHeuristicAntiClustering(), - ExactClusterEditingAntiClustering(), + # ExactClusterEditingAntiClustering(), # This method is extremely slow for large datasets ] -for method in methods: - for k in range(2, 4): - print(f"Method: {method.__class__.__name__}, clusters: {k}") +for k in range(2, 4): + print(f"------------- Number of 
clusters: {k} -------------") + summary = [] + for method in methods: + print(f"Running method: {method.__class__.__name__}") start_time = time.time() df = method.run( @@ -63,6 +67,16 @@ # Mean of differences mean_df = difference_df.reset_index(level=[1]).groupby(["level_1"]).mean() - print(f"∆M: {mean_df.loc['mean'][0]}") - print(f"∆SD: {mean_df.loc['std'][0]}") - print(f"Running time: {time_taken}s") + summary.append( + pd.DataFrame( + { + "Method": [method.__class__.__name__], + "Clusters": [k], + "∆M": [round(mean_df.loc["mean"][0], 4)], + "∆SD": [round(mean_df.loc["std"][0], 4)], + "Time (s)": [time_taken], + } + ) + ) + print("Summary (lower ∆M and ∆SD is better):") + print(pd.concat(summary).to_string())