From 348eee5bb0af7b0c36cb3b1d41487ebe8f504728 Mon Sep 17 00:00:00 2001 From: Collin Fair Date: Thu, 25 Feb 2016 22:42:53 +0000 Subject: [PATCH] Improve efficiency of activity deduplication algorithm. --- tapiriik/services/interchange.py | 21 +++++++++++++++++++++ tapiriik/sync/sync.py | 25 ++++++++++++++++++------- tapiriik/testing/sync.py | 4 ++-- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/tapiriik/services/interchange.py b/tapiriik/services/interchange.py index 1e157dd3a..03f21b87f 100644 --- a/tapiriik/services/interchange.py +++ b/tapiriik/services/interchange.py @@ -332,6 +332,27 @@ def __eq__(self, other): def __ne__(self, other): return not self.__eq__(other) + # We define ascending as most-recent first. + # For simplicity, we compare without timezones. + # The only place this ordering is (intentionally...) used, it doesn't matter. + def __gt__(self, other): + try: + return self.StartTime.replace(tzinfo=None) < other.StartTime.replace(tzinfo=None) + except AttributeError: + return self.StartTime.replace(tzinfo=None) < other.replace(tzinfo=None) + + def __ge__(self, other): + try: + return self.StartTime.replace(tzinfo=None) <= other.StartTime.replace(tzinfo=None) + except AttributeError: + return self.StartTime.replace(tzinfo=None) <= other.replace(tzinfo=None) + + def __lt__(self, other): + return not self.__ge__(other) + + def __le__(self, other): + return not self.__gt__(other) + class UploadedActivity (Activity): pass # will contain list of which service instances contain this activity - not really merited diff --git a/tapiriik/sync/sync.py b/tapiriik/sync/sync.py index 2b5967357..3245cd380 100644 --- a/tapiriik/sync/sync.py +++ b/tapiriik/sync/sync.py @@ -17,8 +17,9 @@ import pytz import kombu import json +import bisect -# Set this up seperate from the logger used in this scope, so services logging messages are caught and logged into user's files. +# Set this up separate from the logger used in this scope, so services logging messages are caught and logged into user's files. _global_logger = logging.getLogger("tapiriik") _global_logger.setLevel(logging.DEBUG) @@ -439,8 +440,13 @@ def _accumulateActivities(self, conn, svcActivities, no_add=False): if act.TZ and not hasattr(act.TZ, "localize"): raise ValueError("Got activity with TZ type " + str(type(act.TZ)) + " instead of a pytz timezone") # Used to ensureTZ() right here - doubt it's needed any more? - existElsewhere = [ - x for x in self._activities if + # Binsearch to find which activities actually need individual attention. + # Otherwise it's O(mn^2). + # self._activities is sorted most recent first + relevantActivitiesStart = bisect.bisect_left(self._activities, act.StartTime + timezoneErrorPeriod) + relevantActivitiesEnd = bisect.bisect_right(self._activities, act.StartTime - timezoneErrorPeriod, lo=relevantActivitiesStart) + extantActIter = ( + x for x in (self._activities[idx] for idx in range(relevantActivitiesStart, relevantActivitiesEnd)) if ( # Identical x.UID == act.UID @@ -481,9 +487,14 @@ def _accumulateActivities(self, conn, svcActivities, no_add=False): and # Prevents closely-spaced activities of known different type from being lumped together - esp. important for manually-enetered ones (x.Type == ActivityType.Other or act.Type == ActivityType.Other or x.Type == act.Type or ActivityType.AreVariants([act.Type, x.Type])) - ] - if len(existElsewhere) > 0: - existingActivity = existElsewhere[0] + ) + + try: + existingActivity = next(extantActIter) + except StopIteration: + existingActivity = None + + if existingActivity: # we don't merge the exclude values here, since at this stage the services have the option of just not returning those activities if act.TZ is not None and existingActivity.TZ is None: existingActivity.TZ = act.TZ @@ -522,7 +533,7 @@ def _accumulateActivities(self, conn, svcActivities, no_add=False): act.UIDs = existingActivity.UIDs # stop the circular inclusion, not that it matters continue if not no_add: - self._activities.append(act) + bisect.insort_left(self._activities, act) def _determineEligibleRecipientServices(self, activity, recipientServices): from tapiriik.auth import User diff --git a/tapiriik/testing/sync.py b/tapiriik/testing/sync.py index 8096505fb..430988619 100644 --- a/tapiriik/testing/sync.py +++ b/tapiriik/testing/sync.py @@ -354,8 +354,8 @@ def test_activity_coalesce(self): self.assertEqual(act.StartTime.tzinfo, actA.StartTime.tzinfo) self.assertLapsListsEqual(act.Laps, actA.Laps) self.assertTrue(act.Private) # Most restrictive setting - self.assertEqual(act.Name, actB.Name) # The first activity takes priority. - self.assertEqual(act.Type, actB.Type) # Same here. + self.assertEqual(act.Name, actA.Name) # The later activity takes priority. + self.assertEqual(act.Type, actA.Type) # Same here. self.assertTrue(list(actB.ServiceDataCollection.keys())[0] in act.ServiceDataCollection) self.assertTrue(list(actA.ServiceDataCollection.keys())[0] in act.ServiceDataCollection)