From 4cd3ca2227513f991ec6f734630ca98b8b0e81f4 Mon Sep 17 00:00:00 2001
From: David McClure
Date: Wed, 25 Jul 2018 10:17:01 -0400
Subject: [PATCH] Single dedupe method.

---
 litecoder/data/city-alt-names.yml    |  9 ++--
 litecoder/models/wof_locality_dup.py | 79 +++++++++++++++++++---------
 2 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/litecoder/data/city-alt-names.yml b/litecoder/data/city-alt-names.yml
index b33df20..74ac042 100644
--- a/litecoder/data/city-alt-names.yml
+++ b/litecoder/data/city-alt-names.yml
@@ -1,6 +1,7 @@
 ---
 
 # TODO: Separate neighborhoods list, for eg Hollywood / Bronx?
+# TODO: Switch to WOF ids, since some don't have Wikidata?
 
 # https://en.wikipedia.org/wiki/List_of_city_nicknames_in_the_United_States
 # https://web.archive.org/web/20080701040406/http://www.taglineguru.com:80/citymottosmonikers.html
@@ -43,6 +44,7 @@ Q1297:
 
 # Houston
 Q16555:
+  - HTX
   - Space City
   - H-Town
 
@@ -109,14 +111,11 @@ Q23556:
   - The ATL
   - Hotlanta
 
+# Denver
 Q16554:
   - Mile High City
   - The Mile High City
 
-# Houston
-Q16555:
-  - HTX
-
 # Richmond
 Q43421:
   - RVA
@@ -125,7 +124,7 @@
 Q38022:
   - STL
 
-# San Antonia
+# San Antonio
 Q975:
   - SATX
 
diff --git a/litecoder/models/wof_locality_dup.py b/litecoder/models/wof_locality_dup.py
index b6df1a8..af50912 100644
--- a/litecoder/models/wof_locality_dup.py
+++ b/litecoder/models/wof_locality_dup.py
@@ -12,6 +12,22 @@
 from ..db import session
 
 
+ID_COLS = (
+    'dbp_id',
+    'fb_id',
+    'fct_id',
+    'fips_code',
+    'gn_id',
+    'gp_id',
+    'loc_id',
+    'nyt_id',
+    'qs_id',
+    'qs_pg_id',
+    'wd_id',
+    'wk_page',
+)
+
+
 class WOFLocalityDup(BaseModel):
 
     __tablename__ = 'wof_locality_dup'
@@ -28,7 +44,35 @@ def update(cls, wof_ids):
         session.commit()
 
     @classmethod
-    def dedupe_by_proximity(cls, buffer=0.1):
+    def dedupe_id_col(cls, col_name):
+        """Dedupe localities via shared external identifier.
+        """
+        dup_col = getattr(WOFLocality, col_name)
+
+        logger.info('Mapping `%s` -> rows' % col_name)
+
+        id_rows = defaultdict(list)
+        for row in tqdm(WOFLocality.query.filter(dup_col != None)):
+            id_rows[getattr(row, col_name)].append(row)
+
+        logger.info('Deduping rows with shared `%s`' % col_name)
+
+        dupes = set()
+        for rows in tqdm(id_rows.values()):
+            if len(rows) > 1:
+
+                # Sort by completeness.
+                rows = sorted(rows, key=lambda r: r.field_count, reverse=True)
+
+                # Add all but most complete to dupes.
+                for row in rows[1:]:
+                    dupes.add(row.wof_id)
+
+        cls.update(dupes)
+        return len(dupes)
+
+    @classmethod
+    def dedupe_proximity(cls, buffer=0.1):
         """For each locality, get neighbors within N degrees. If any of these
         (a) has the same name and (b) has more complete metadata, set dupe.
         """
@@ -56,29 +100,16 @@ def dedupe_by_proximity(cls, buffer=0.1):
         return len(dupes)
 
     @classmethod
-    def dedupe_shared_id_col(cls, col_name):
-        """Dedupe localities via shared external identifier.
+    def dedupe(cls):
+        """Dedupe on all id cols + proximity.
         """
-        dup_col = getattr(WOFLocality, col_name)
-
-        logger.info('Mapping `%s` -> rows' % col_name)
-
-        id_rows = defaultdict(list)
-        for row in tqdm(WOFLocality.query.filter(dup_col != None)):
-            id_rows[getattr(row, col_name)].append(row)
-
-        logger.info('Deduping rows with shared `%s`' % col_name)
-
-        dupes = set()
-        for rows in tqdm(id_rows.values()):
-            if len(rows) > 1:
+        for name in ID_COLS:
+            cls.dedupe_id_col(name)
 
-                # Sort by completeness.
-                rows = sorted(rows, key=lambda r: r.field_count, reverse=True)
+        cls.dedupe_proximity()
 
-                # Add all but most complete to dupes.
-                for row in rows[1:]:
-                    dupes.add(row.wof_id)
-
-        cls.update(dupes)
-        return len(dupes)
+    @classmethod
+    def count_unique(cls):
+        """Count unique duplicate rows.
+        """
+        return session.query(cls.wof_id.distinct()).count()
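
For review, the shared-id pass that dedupe() now drives reduces to the short
standalone sketch below. This is a minimal illustration only, not the patch's
code: the Row class, the sample data, and returning the dupe set directly
(rather than persisting it via cls.update) are all stand-ins invented for the
example, while the real method runs over the WOFLocality SQLAlchemy models
with tqdm progress bars and logging.

    from collections import defaultdict


    class Row:
        """Toy stand-in for a WOFLocality result row (hypothetical)."""

        def __init__(self, wof_id, wd_id, field_count):
            self.wof_id = wof_id            # Who's On First id
            self.wd_id = wd_id              # shared external id (Wikidata)
            self.field_count = field_count  # metadata completeness


    def dedupe_id_col(rows, col_name):
        """Group rows on a shared id column; flag all but the most complete
        row in each group as duplicates. Returns the set of dupe wof_ids.
        """
        id_rows = defaultdict(list)
        for row in rows:
            key = getattr(row, col_name)
            if key is not None:
                id_rows[key].append(row)

        dupes = set()
        for group in id_rows.values():
            if len(group) > 1:
                # Sort by completeness; keep only the richest row.
                group = sorted(group, key=lambda r: r.field_count, reverse=True)
                for row in group[1:]:
                    dupes.add(row.wof_id)

        return dupes


    # Rows 1 and 2 share wd_id Q16555; the sparser row (wof_id=2) is flagged.
    rows = [Row(1, 'Q16555', 12), Row(2, 'Q16555', 4), Row(3, 'Q975', 9)]
    assert dedupe_id_col(rows, 'wd_id') == {2}

Same shape as the patched method: field_count decides the winner per shared
id, and everything else in the group gets marked as a duplicate.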