From b56fb137360885e81926fe8a4d60138d5d08f910 Mon Sep 17 00:00:00 2001 From: Matthias Als Date: Thu, 13 Jun 2024 11:26:42 +0200 Subject: [PATCH] Adding Tabu search as new algorithm (#29) --- .github/workflows/prepare_release.yaml | 2 +- anti_clustering/__init__.py | 1 + anti_clustering/_cluster_swap_heuristic.py | 4 +- anti_clustering/tabu_search_heuristic.py | 99 ++++++++++++++++++++++ examples/evaluation.py | 28 ++++-- 5 files changed, 124 insertions(+), 10 deletions(-) create mode 100644 anti_clustering/tabu_search_heuristic.py diff --git a/.github/workflows/prepare_release.yaml b/.github/workflows/prepare_release.yaml index e0d1f31..eb5c98c 100644 --- a/.github/workflows/prepare_release.yaml +++ b/.github/workflows/prepare_release.yaml @@ -15,4 +15,4 @@ jobs: - uses: SneaksAndData/github-actions/semver_release@v0.1.0 with: major_v: 0 - minor_v: 3 + minor_v: 4 diff --git a/anti_clustering/__init__.py b/anti_clustering/__init__.py index 90715a5..6087b62 100644 --- a/anti_clustering/__init__.py +++ b/anti_clustering/__init__.py @@ -17,4 +17,5 @@ from anti_clustering.naive_random_heuristic import NaiveRandomHeuristicAntiClustering from anti_clustering.exact_cluster_editing import ExactClusterEditingAntiClustering from anti_clustering.exchange_heuristic import ExchangeHeuristicAntiClustering +from anti_clustering.tabu_search_heuristic import TabuSearchHeuristicAntiClustering from anti_clustering._base import AntiClustering diff --git a/anti_clustering/_cluster_swap_heuristic.py b/anti_clustering/_cluster_swap_heuristic.py index 2bf039b..2aac607 100644 --- a/anti_clustering/_cluster_swap_heuristic.py +++ b/anti_clustering/_cluster_swap_heuristic.py @@ -84,8 +84,8 @@ def _get_random_clusters(self, num_groups: int, num_elements: int) -> npt.NDArra def _calculate_objective(self, cluster_assignment: npt.NDArray[bool], distance_matrix: npt.NDArray[float]) -> float: """ Calculate objective value - :param cluster_assignment: Cluster assignment matrix - :param 
distance_matrix: Cost matrix + :param cluster_assignment: Cluster assignment + :param distance_matrix: Distance matrix :return: Objective value """ return np.multiply(cluster_assignment, distance_matrix).sum() diff --git a/anti_clustering/tabu_search_heuristic.py b/anti_clustering/tabu_search_heuristic.py new file mode 100644 index 0000000..df11246 --- /dev/null +++ b/anti_clustering/tabu_search_heuristic.py @@ -0,0 +1,99 @@ +# Copyright 2022 ECCO Sneaks & Data +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A tabu search with restarts approach to solving the anti-clustering problem. +""" + +import numpy.typing as npt +from anti_clustering._cluster_swap_heuristic import ClusterSwapHeuristic + + +class TabuSearchHeuristicAntiClustering(ClusterSwapHeuristic): +    """ +    A tabu search with restarts approach to solving the anti-clustering problem. +    In this version, specific transformations are put in the tabu list, not solutions. 
+ """ + + def __init__( + self, + verbose: bool = False, + random_seed: int = None, + tabu_tenure: int = 10, + iterations: int = 2000, + restarts: int = 9, + ): + # pylint: disable = R0913 + super().__init__(verbose=verbose, random_seed=random_seed) + self.tabu_tenure = tabu_tenure + self.iterations = iterations + self.restarts = restarts + + def _solve(self, distance_matrix: npt.NDArray[float], num_groups: int) -> npt.NDArray[bool]: + # Start with random cluster assignment + cluster_assignment = self._get_random_clusters(num_groups=num_groups, num_elements=len(distance_matrix)) + + if self.verbose: + print("Solving") + + candidate_solutions = [] + + for restart in range(self.restarts): + tabu_swaps = [] + # Initial objective value + objective = self._calculate_objective(cluster_assignment, distance_matrix) + for iteration in range(self.iterations): + if self.verbose and iteration % 5 == 0: + print(f"Iteration {iteration + 1} of {self.iterations}") + + # Select random element + i = self.rnd.randint(0, len(distance_matrix) - 1) + + # Get possible swaps + possible_exchanges = [ + j + for j in self._get_exchanges(cluster_assignment, i) + if (i, j) not in tabu_swaps and (j, i) not in tabu_swaps + ] + + if len(possible_exchanges) == 0: + continue + + # Generate possible assignments + j = possible_exchanges[self.rnd.randint(0, len(possible_exchanges) - 1)] + + # Select random possible swap. 
+ new_cluster_assignment = self._swap(cluster_assignment, i, j) + new_objective = self._calculate_objective(new_cluster_assignment, distance_matrix) + + # Select solution as current if it improves the objective value + if new_objective > objective: + cluster_assignment = new_cluster_assignment + objective = new_objective + tabu_swaps.append((i, j)) + # Delete oldest tabu swap if tabu list is full + if len(tabu_swaps) > self.tabu_tenure: + tabu_swaps.pop(0) + + candidate_solutions.append((objective, cluster_assignment)) + + if self.verbose: + print(f"Restart {restart + 1} of {self.restarts}") + + # Cold restart, select random cluster assignment + cluster_assignment = self._get_random_clusters(num_groups=num_groups, num_elements=len(distance_matrix)) + + # Select best solution, maximizing objective + _, best_cluster_assignment = max(candidate_solutions, key=lambda x: x[0]) + + return best_cluster_assignment diff --git a/examples/evaluation.py b/examples/evaluation.py index aeb0e47..4587e71 100644 --- a/examples/evaluation.py +++ b/examples/evaluation.py @@ -25,6 +25,7 @@ ExchangeHeuristicAntiClustering, SimulatedAnnealingHeuristicAntiClustering, NaiveRandomHeuristicAntiClustering, + TabuSearchHeuristicAntiClustering, ExactClusterEditingAntiClustering, AntiClustering, ) @@ -36,15 +37,18 @@ iris_df = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names) methods: List[AntiClustering] = [ + TabuSearchHeuristicAntiClustering(iterations=5000, restarts=10, tabu_tenure=50), ExchangeHeuristicAntiClustering(restarts=20), SimulatedAnnealingHeuristicAntiClustering(alpha=0.95, iterations=5000, starting_temperature=1000, restarts=20), NaiveRandomHeuristicAntiClustering(), - ExactClusterEditingAntiClustering(), + # ExactClusterEditingAntiClustering(), # This method is extremely slow for large datasets ] -for method in methods: - for k in range(2, 4): - print(f"Method: {method.__class__.__name__}, clusters: {k}") +for k in range(2, 4): + print(f"------------- Number of 
clusters: {k} -------------") + summary = [] + for method in methods: + print(f"Running method: {method.__class__.__name__}") start_time = time.time() df = method.run( @@ -63,6 +67,16 @@ # Mean of differences mean_df = difference_df.reset_index(level=[1]).groupby(["level_1"]).mean() - print(f"∆M: {mean_df.loc['mean'][0]}") - print(f"∆SD: {mean_df.loc['std'][0]}") - print(f"Running time: {time_taken}s") + summary.append( + pd.DataFrame( + { + "Method": [method.__class__.__name__], + "Clusters": [k], + "∆M": [round(mean_df.loc["mean"][0], 4)], + "∆SD": [round(mean_df.loc["std"][0], 4)], + "Time (s)": [time_taken], + } + ) + ) + print("Summary (lower ∆M and ∆SD is better):") + print(pd.concat(summary).to_string())