From 80716d2df3cf3b77aa5496d69d6df040cf0c6e39 Mon Sep 17 00:00:00 2001 From: Neil Rotstan Date: Mon, 9 Dec 2019 13:56:01 -0800 Subject: [PATCH] Support meta-data for suspicion reasons * Represent suspicion reasons with a new Reason class that, in addition to the string identifier, supports a label and source/origin of the reason * Continue to store string identifiers in the Changeset.suspicion_reasons field for backward compatibility * Introduce a new Changeset.detailed_reasons field that has the full Reason objects * Use the Reason.source to track when suspicion reasons originated as warnings generated by an editor, such as iD * When printing suspicion reasons in the CLI, include the source of the reason if it was not from osmcha * Update unit tests --- osmcha/changeset.py | 39 ++++++++++++++++++++++++--------------- osmcha/scripts/cli.py | 12 +++++++++++- tests/test_mod.py | 13 +++++++++---- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/osmcha/changeset.py b/osmcha/changeset.py index f6d4a1a..92068a0 100644 --- a/osmcha/changeset.py +++ b/osmcha/changeset.py @@ -42,6 +42,14 @@ class InvalidChangesetError(Exception): pass +class Reason: + """Details and meta-data for a suspicion reason.""" + def __init__(self, identifier, label=None, source="osmcha"): + self.identifier = identifier + self.label = label if label else identifier + self.source = source + + def get_user_details(user_id): """Get information about number of changesets, blocks and mapping days of a user, using both the OSM API and the Mapbox comments APIself. @@ -56,7 +64,7 @@ def get_user_details(user_id): changesets = [i for i in xml_data if i.tag == 'changesets'][0] blocks = [i for i in xml_data if i.tag == 'blocks'][0] if int(changesets.get('count')) <= 5: - reasons.append('New mapper') + reasons.append(Reason('New mapper')) elif int(changesets.get('count')) <= 30: url = MAPBOX_USERS_API.format( user_id=requests.compat.quote(user_id) @@ -67,9 +75,9 @@ def get_user_details(user_id): user_request.json().get('extra').get('mapping_days') ) if mapping_days <= 5: - reasons.append('New mapper') + reasons.append(Reason('New mapper')) if int(blocks[0].get('count')) > 1: - reasons.append('User has multiple blocks') + reasons.append(Reason('User has multiple blocks')) except Exception as e: message = 'Could not verify user of the changeset: {}, {}' print(message.format(user_id, str(e))) @@ -239,7 +247,6 @@ def filter(self): if get_bounds(ch).intersects(self.area) ] - class Analyse(object): """Analyse a changeset and define if it is suspect.""" def __init__(self, changeset, create_threshold=200, modify_threshold=200, @@ -299,6 +306,7 @@ def set_fields(self, changeset): '%Y-%m-%dT%H:%M:%SZ' ) self.suspicion_reasons = [] + self.detailed_reasons = {} self.is_suspect = False self.powerfull_editor = False self.warning_tags = [ @@ -307,7 +315,8 @@ def set_fields(self, changeset): def label_suspicious(self, reason): """Add suspicion reason and set the suspicious flag.""" - self.suspicion_reasons.append(reason) + self.suspicion_reasons.append(reason.identifier) + self.detailed_reasons[reason.identifier] = reason self.is_suspect = True def full_analysis(self): @@ -318,12 +327,12 @@ def full_analysis(self): self.verify_warning_tags() if self.review_requested == 'yes': - self.label_suspicious('Review requested') + self.label_suspicious(Reason('Review requested')) def verify_warning_tags(self): for tag in self.warning_tags: if tag in self.enabled_warnings.keys(): - self.label_suspicious(self.enabled_warnings.get(tag)) + self.label_suspicious(Reason(self.enabled_warnings.get(tag), None, self.editor)) def verify_user(self): """Verify if the changeset was made by a inexperienced mapper (anyone @@ -338,7 +347,7 @@ def verify_words(self): """ if self.comment: if find_words(self.comment, self.suspect_words, self.excluded_words): - self.label_suspicious('suspect_word') + self.label_suspicious(Reason('suspect_word')) if self.source: for word in self.illegal_sources: @@ -346,13 +355,13 @@ def verify_words(self): if word == 'yandex' and 'yandex panorama' in self.source.lower(): pass else: - self.label_suspicious('suspect_word') + self.label_suspicious(Reason('suspect_word')) break if self.imagery_used: for word in self.illegal_sources: if word in self.imagery_used.lower(): - self.label_suspicious('suspect_word') + self.label_suspicious(Reason('suspect_word')) break self.suspicion_reasons = list(set(self.suspicion_reasons)) @@ -383,10 +392,10 @@ def verify_editor(self): 'mapwith.ai' ] if self.host.split('://')[-1].split('/')[0] not in trusted_hosts: - self.label_suspicious('Unknown iD instance') + self.label_suspicious(Reason('Unknown iD instance')) else: self.powerfull_editor = True - self.label_suspicious('Software editor was not declared') + self.label_suspicious(Reason('Software editor was not declared')) def count(self): """Count the number of elements created, modified and deleted by the @@ -404,14 +413,14 @@ def count(self): if (self.create / len(actions) > self.percentage and self.create > self.create_threshold and (self.powerfull_editor or self.create > self.top_threshold)): - self.label_suspicious('possible import') + self.label_suspicious(Reason('possible import')) elif (self.modify / len(actions) > self.percentage and self.modify > self.modify_threshold): - self.label_suspicious('mass modification') + self.label_suspicious(Reason('mass modification')) elif ((self.delete / len(actions) > self.percentage and self.delete > self.delete_threshold) or self.delete > self.top_threshold): - self.label_suspicious('mass deletion') + self.label_suspicious(Reason('mass deletion')) except ZeroDivisionError: print('It seems this changeset was redacted') diff --git a/osmcha/scripts/cli.py b/osmcha/scripts/cli.py index d180a19..98e4102 100644 --- a/osmcha/scripts/cli.py +++ b/osmcha/scripts/cli.py @@ -14,9 +14,19 @@ def cli(id): 'Created: %s. Modified: %s. Deleted: %s' % (ch.create, ch.modify, ch.delete) ) if ch.is_suspect: + reasonDescriptions = [formatted_reason(reason) for reason in ch.detailed_reasons.values()] click.echo('The changeset {} is suspect! Reasons: {}'.format( id, - ', '.join(ch.suspicion_reasons) + ', '.join(reasonDescriptions) )) else: click.echo('The changeset %s is not suspect!' % id) + +def formatted_reason(reason): + """ + Generate a formatted description of the reason using its label and appending + the source of the reason if it was not from osmcha (e.g. if it originated + from an editor warning) + """ + sourceLabel = ' ({})'.format(reason.source) if reason.source != 'osmcha' else '' + return '{}{}'.format(reason.label, sourceLabel) diff --git a/tests/test_mod.py b/tests/test_mod.py index ce45a92..586cd91 100644 --- a/tests/test_mod.py +++ b/tests/test_mod.py @@ -5,6 +5,7 @@ from shapely.geometry import Polygon from osmcha.changeset import ChangesetList +from osmcha.changeset import Reason from osmcha.changeset import Analyse from osmcha.changeset import WORDS from osmcha.changeset import find_words @@ -100,8 +101,9 @@ def test_analyse_label_suspicious(): ]) } ch = Analyse(ch_dict) - ch.label_suspicious('some reason') + ch.label_suspicious(Reason('some reason')) assert 'some reason' in ch.suspicion_reasons + assert ch.detailed_reasons['some reason'].identifier == 'some reason' assert ch.is_suspect @@ -601,10 +603,11 @@ def test_get_dict(): assert 'is_suspect' in ch.get_dict().keys() assert 'powerfull_editor' in ch.get_dict().keys() assert 'suspicion_reasons' in ch.get_dict().keys() + assert 'detailed_reasons' in ch.get_dict().keys() assert 'create' in ch.get_dict().keys() assert 'modify' in ch.get_dict().keys() assert 'delete' in ch.get_dict().keys() - assert len(ch.get_dict().keys()) == 15 + assert len(ch.get_dict().keys()) == 16 # An iD changeset with warnings: ch = Analyse(72783703) @@ -621,10 +624,11 @@ def test_get_dict(): assert 'is_suspect' in ch.get_dict().keys() assert 'powerfull_editor' in ch.get_dict().keys() assert 'suspicion_reasons' in ch.get_dict().keys() + assert 'detailed_reasons' in ch.get_dict().keys() assert 'create' in ch.get_dict().keys() assert 'modify' in ch.get_dict().keys() assert 'delete' in ch.get_dict().keys() - assert len(ch.get_dict().keys()) == 15 + assert len(ch.get_dict().keys()) == 16 # A JOSM changeset ch = Analyse(46315321) @@ -641,10 +645,11 @@ def test_get_dict(): assert 'is_suspect' in ch.get_dict().keys() assert 'powerfull_editor' in ch.get_dict().keys() assert 'suspicion_reasons' in ch.get_dict().keys() + assert 'detailed_reasons' in ch.get_dict().keys() assert 'create' in ch.get_dict().keys() assert 'modify' in ch.get_dict().keys() assert 'delete' in ch.get_dict().keys() - assert len(ch.get_dict().keys()) == 15 + assert len(ch.get_dict().keys()) == 16 def test_changeset_without_tags():