Skip to content

Commit

Permalink
Single dedupe method.
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmcclure committed Jul 25, 2018
1 parent 8040295 commit 4cd3ca2
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 29 deletions.
9 changes: 4 additions & 5 deletions litecoder/data/city-alt-names.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
---

# TODO: Separate neighborhoods list, for eg Hollywood / Bronx?
# TODO: Switch to WOF ids, since some don't have Wikidata?

# https://en.wikipedia.org/wiki/List_of_city_nicknames_in_the_United_States
# https://web.archive.org/web/20080701040406/http://www.taglineguru.com:80/citymottosmonikers.html
Expand Down Expand Up @@ -43,6 +44,7 @@ Q1297:

# Houston
Q16555:
- HTX
- Space City
- H-Town

Expand Down Expand Up @@ -109,14 +111,11 @@ Q23556:
- The ATL
- Hotlanta

# Denver
Q16554:
- Mile High City
- The Mile High City

# Houston
Q16555:
- HTX

# Richmond
Q43421:
- RVA
Expand All @@ -125,7 +124,7 @@ Q43421:
Q38022:
- STL

# San Antonia
# San Antonio
Q975:
- SATX

Expand Down
79 changes: 55 additions & 24 deletions litecoder/models/wof_locality_dup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,22 @@
from ..db import session


# External-identifier columns on WOFLocality used for deduping: two
# locality rows sharing a non-null value in any one of these columns
# refer to the same place.
ID_COLS = tuple("""
    dbp_id fb_id fct_id fips_code gn_id gp_id
    loc_id nyt_id qs_id qs_pg_id wd_id wk_page
""".split())


class WOFLocalityDup(BaseModel):

__tablename__ = 'wof_locality_dup'
Expand All @@ -28,7 +44,35 @@ def update(cls, wof_ids):
session.commit()

@classmethod
def dedupe_id_col(cls, col_name):
    """Flag duplicate localities that share an external identifier.

    Rows with the same non-null value in ``col_name`` are grouped;
    within each group the row with the highest ``field_count`` (most
    complete metadata) is kept, and every other row's ``wof_id`` is
    registered as a duplicate via ``cls.update()``.

    Args:
        col_name (str): Name of an id column on WOFLocality
            (see ID_COLS).

    Returns:
        int: Number of rows flagged as duplicates.
    """
    dup_col = getattr(WOFLocality, col_name)

    logger.info('Mapping `%s` -> rows' % col_name)

    # Group rows by their shared identifier value. SQLAlchemy needs
    # `!= None` (not `is not None`) to emit an IS NOT NULL clause.
    id_rows = defaultdict(list)
    for row in tqdm(WOFLocality.query.filter(dup_col != None)):
        id_rows[getattr(row, col_name)].append(row)

    logger.info('Deduping rows with shared `%s`' % col_name)

    dupes = set()
    for rows in tqdm(id_rows.values()):
        if len(rows) > 1:

            # Sort by completeness, most complete first.
            rows = sorted(rows, key=lambda r: r.field_count, reverse=True)

            # Add all but the most complete row to dupes.
            for row in rows[1:]:
                dupes.add(row.wof_id)

    cls.update(dupes)
    return len(dupes)

@classmethod
def dedupe_proximity(cls, buffer=0.1):
"""For each locality, get neighbors within N degrees. If any of these
(a) has the same name and (b) has more complete metadata, set dupe.
"""
Expand Down Expand Up @@ -56,29 +100,16 @@ def dedupe_by_proximity(cls, buffer=0.1):
return len(dupes)

@classmethod
def dedupe(cls):
    """Run the full dedupe pass.

    Applies `dedupe_id_col` for every column in ID_COLS, then flags
    same-name neighbors via `dedupe_proximity`.
    """
    for name in ID_COLS:
        cls.dedupe_id_col(name)

    cls.dedupe_proximity()
@classmethod
def count_unique(cls):
    """Return the number of distinct duplicate wof_ids recorded.

    Returns:
        int: Count of unique ``wof_id`` values in this table.
    """
    distinct_ids = session.query(cls.wof_id.distinct())
    return distinct_ids.count()

0 comments on commit 4cd3ca2

Please sign in to comment.