Skip to content

Commit

Permalink
Speed up id col dedupe.
Browse files: browse the repository at this point in the history
  • Loading branch information
davidmcclure committed Jul 25, 2018
1 parent 85df11a commit 8040295
Showing 1 changed file with 18 additions and 14 deletions.
32 changes: 18 additions & 14 deletions litecoder/models/wof_locality_dup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
from sqlalchemy import Column, Integer, ForeignKey, func

from tqdm import tqdm
from collections import defaultdict
from scipy.spatial import cKDTree

from .base import BaseModel
from .wof_locality import WOFLocality
from .. import logger
from ..db import session


Expand Down Expand Up @@ -51,30 +53,32 @@ def dedupe_by_proximity(cls, buffer=0.1):
)

cls.update(dupes)
return len(dupes)

@classmethod
def dedupe_shared_id_col(cls, col_name):
    """Dedupe localities via shared external identifier.

    Loads every locality whose ``col_name`` value is set in a single
    pass and groups the rows by that value in memory, rather than issuing
    one follow-up query per duplicated identifier.

    Args:
        col_name (str): Name of the ``WOFLocality`` attribute that holds
            the external identifier.

    Returns:
        int: Number of ``wof_id`` values flagged as duplicates.
    """
    dup_col = getattr(WOFLocality, col_name)

    logger.info('Mapping `%s` -> rows' % col_name)

    # `!= None` is deliberate: presumably `dup_col` is a SQLAlchemy column
    # attribute, where the overloaded operator renders as `IS NOT NULL`.
    # A plain `is not None` would be evaluated in Python instead.
    id_rows = defaultdict(list)
    for row in tqdm(WOFLocality.query.filter(dup_col != None)):  # noqa: E711
        id_rows[getattr(row, col_name)].append(row)

    logger.info('Deduping rows with shared `%s`' % col_name)

    dupes = set()
    for rows in tqdm(id_rows.values()):

        # Identifiers held by a single row have nothing to dedupe.
        if len(rows) < 2:
            continue

        # Sort by completeness, most complete first.
        ranked = sorted(rows, key=lambda r: r.field_count, reverse=True)

        # Add all but most complete to dupes.
        dupes.update(row.wof_id for row in ranked[1:])

    # NOTE(review): `cls.update` is defined outside this view; presumably
    # it persists the collected ids as duplicate records -- confirm.
    cls.update(dupes)
    return len(dupes)

0 comments on commit 8040295

Please sign in to comment.