Skip to content

Commit

Permalink
Add script to pre-process gbif backbone (#3)
Browse files Browse the repository at this point in the history
* First take on process-backbone.py

* Copy original files first

* Handle disallowed sources

* Update comments and readme

* Update readme
  • Loading branch information
matsbov authored Jan 8, 2024
1 parent 8da0efc commit b41ac30
Show file tree
Hide file tree
Showing 3 changed files with 297 additions and 6 deletions.
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,12 @@ run-docker:

release:
../sbdi-install/utils/make-release.sh

# Download and unpack the current GBIF backbone taxonomy into the import dir.
fetch-backbone:
	mkdir -p /data/bie-index/import
	# -f: do not fail on the first run, when the directory does not exist yet
	rm -rf /data/bie-index/import/backbone
	wget https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip -O /data/bie-index/import/backbone.zip
	unzip -q /data/bie-index/import/backbone.zip -d /data/bie-index/import/backbone/

# Pre-process the downloaded backbone before importing it into the bie-index.
process-backbone:
	./sbdi/process-backbone.py /data/bie-index/import/backbone
22 changes: 16 additions & 6 deletions sbdi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,19 @@ Tag 1.0.2 created and pushed.

## Build search index

The solr search index can be built from scratch in the following way from the /admin page:

* Select **DwCA Import** and import the [GBIF Backbone Taxonomy](https://www.gbif.org/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c) (~5h)
* Select **Create Links** and run:
* **Build search and suggest weights** (~7h)
* **Build solr suggestion index** (~15min, application will throw a read timeout exception but indexing will continue to run on Solr)
The solr search index can be built from scratch from the [GBIF Backbone Taxonomy](https://www.gbif.org/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c). Before importing the data it needs to be pre-processed (see [process-backbone.py](./process-backbone.py) for details).

* Download backbone (to `/data/bie-index/import`):
```
make fetch-backbone
```
* Pre-process backbone:
```
make process-backbone
```
* Go to the /admin page and select **DwCA Import** and import from `/data/bie-index/import/backbone` (~2:15h)
* Go to the /admin page and select **Create Links** and run:
* **Denormalise taxa** (~8h)
* **Build link identifiers** (~7h) (not sure if this is necessary)
* **Build search and suggest weights** (~2:15h)
* **Build solr suggestion index** (~15min - the application will throw a read timeout exception but indexing will continue to run on Solr)
272 changes: 272 additions & 0 deletions sbdi/process-backbone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
#!/usr/bin/env python3
#
# This script pre-processes the GBIF Backbone taxonomy before loading it into the bie-index.
#
# The original files are renamed (eg. Taxon.tsv -> Taxon.tsv.original) and the processed file
# is saved with the original name (eg. Taxon.tsv).
#
# The following processing is done:
#
# Taxon
# -----
# - Remove scientificNameAuthorship from scientificName (if included) because the bie-index
# expects the scientificName to be without authorship.
# Eg: Capreolus capreolus (Linnaeus, 1758) -> Capreolus capreolus
#
# VernacularName
# --------------
# - Only include Swedish and English names
# - Exclude names from some sources of bad quality
#

import os
import sys

# Vernacular-name languages to keep (ISO 639-1 codes).
ALLOWED_LANGUAGES = [
    'sv',
    'en',
]
# Vernacular-name sources to drop entirely, due to poor data quality.
DISALLOWED_SOURCES = [
    'Belgian Species List', # Contains comma-separated lists of names
    # All of these have names in various languages wrongly tagged as English
    'Abrocomidae',
    'Acrobatidae',
    'Ailuridae',
    'Alpheidae',
    'Annelida',
    'Anomaluridae',
    'Antilocapridae',
    'Aotidae',
    'Aplodontiidae',
    'Atelidae',
    'Balaenidae',
    'Balaenopteridae',
    'Bathyergidae',
    'Bovidae',
    'Bradypodidae',
    'Burramyidae',
    'Caenolestidae',
    'Calomyscidae',
    'Camelidae',
    'Canidae',
    'Castoridae',
    'Caviidae',
    'Cebidae',
    'Cercopithecidae',
    'Cervidae',
    'Cheirogaleidae',
    'Chinchillidae',
    'Chlamyphoridae',
    'Chrysochloridae',
    'Cnidaria',
    'Cricetidae',
    'Ctenodactylidae',
    'Ctenomyidae',
    'Cuniculidae',
    'Cyclopedidae',
    'Cynocephalidae',
    'Dasypodidae',
    'Dasyproctidae',
    'Dasyuridae',
    'Daubentoniidae',
    'Delphinidae',
    'Diatomyidae',
    'Didelphidae',
    'Dinomyidae',
    'Dipodidae',
    'Dugongidae',
    'Echimyidae',
    'Echinoderms',
    'Elephantidae',
    'Equidae',
    'Erethizontidae',
    'Erinaceidae',
    'Eschrichtiidae',
    'Eupleridae',
    'Felidae',
    'Galagidae',
    'Geomyidae',
    'Giraffidae',
    'Gliridae',
    'Herpestidae',
    'Heterocephalidae',
    'Heteromyidae',
    'Hippopotamidae',
    'Hipposideridae',
    'Hominidae',
    'Hyaenidae',
    'Hylobatidae',
    'Hypsiprymnodontidae',
    'Hystricidae',
    'Indriidae',
    'Iniidae',
    'Lemuridae',
    'Lepilemuridae',
    'Leporidae',
    'Lipotidae',
    'Lorisidae',
    'Macropodidae',
    'Macroscelididae',
    'Manidae',
    'Megalonychidae',
    'Mephitidae',
    'Molossidae',
    'Monodontidae',
    'Mormoopidae',
    'Moschidae',
    'Muridae',
    'Mustelidae',
    'Myriatrix',
    'Myrmecobiidae',
    'Myrmecophagidae',
    'Mystacinidae',
    'Myzopodidae',
    'Nandiniidae',
    'Natalidae',
    'Nayades',
    'Neobalaenidae',
    'Nesomyidae',
    'Noctilionidae',
    'Notoryctidae',
    'Nycteridae',
    'Ochotonidae',
    'Octodontidae',
    'Odobenidae',
    'Orycetropodidae',
    'Otariidae',
    'Pedetidae',
    'Peracarida',
    'Peramelidae',
    'Petromuridae',
    'Phalangeridae',
    'Phitheciidae',
    'Phocidae',
    'Phocoenidae',
    'Phyllostomidae',
    'Physeteridae',
    'Platacanthomyidae',
    'Platanistidae',
    'Pontoporiidae',
    'Porifera',
    'Potamogalidae',
    'Potoroidae',
    'Procaviidae',
    'Procyonidae',
    'Pseudocheiridae',
    'Pteropodidae',
    'Ptilocercidae',
    'Rhinocerotidae',
    'Rhinolophidae',
    'Rhinonycteridae',
    'Rhinopomatidae',
    'Sciuridae',
    'Sminthidae',
    'Solenodontidae',
    'Soricidae',
    'Spalacidae',
    'Suidae',
    'Talpidae',
    'Tapiridae',
    'Tarsiidae',
    'Tarsipedidae',
    'Tayassuidae',
    'Tenrecidae',
    'Thryonomyidae',
    'Thylacomyidae',
    'Thyropteridae',
    'Tragulidae',
    'Trichechidae',
    'Tupaiidae',
    'Ursidae',
    'Vespertilionidae',
    'Viverridae',
    'Vombatidae',
    'Zapodidae',
    'Ziphiidae',
]

def process_taxon(src_dir):
    """Strip scientificNameAuthorship from scientificName in Taxon.tsv.

    The bie-index expects scientificName without authorship, e.g.
    'Capreolus capreolus (Linnaeus, 1758)' -> 'Capreolus capreolus'.

    The original file is renamed to Taxon.tsv.original (kept for
    reference) and the processed rows are written to Taxon.tsv.
    """
    print('\nProcess taxon')

    destination_path = f'{src_dir}/Taxon.tsv'
    original_path = f'{destination_path}.original'

    # Rename original file (if not already done), so a re-run processes
    # the untouched data instead of the already-processed file.
    if not os.path.isfile(original_path):
        os.rename(destination_path, original_path)

    row_count = 0

    # Context managers guarantee the files are closed even on error;
    # GBIF ships the backbone as UTF-8, so decode explicitly rather
    # than relying on the platform default encoding.
    with open(original_path, 'r', encoding='utf-8') as infile, \
         open(destination_path, 'w', encoding='utf-8') as outfile:

        for row in infile:

            record = row.rstrip('\n').split('\t')
            scientificName = record[5]
            scientificNameAuthorship = record[6]

            # Remove scientificNameAuthorship from scientificName
            if scientificNameAuthorship and scientificName.endswith(scientificNameAuthorship):
                record[5] = scientificName[:-len(scientificNameAuthorship)].strip()

            outfile.write('\t'.join(record) + '\n')

            row_count += 1

            if row_count % 1000000 == 0:
                print(f'Processed {row_count} rows')

    print(f'Done. Processed {row_count} rows')

def process_vernacular_name(src_dir):
    """Filter VernacularName.tsv down to usable names.

    Keeps the header row plus every row whose language is in
    ALLOWED_LANGUAGES and whose source is not in DISALLOWED_SOURCES.

    The original file is renamed to VernacularName.tsv.original (kept
    for reference) and the filtered rows are written to
    VernacularName.tsv.
    """
    print('\nProcess vernacular name')

    destination_path = f'{src_dir}/VernacularName.tsv'
    original_path = f'{destination_path}.original'

    # Rename original file (if not already done), so a re-run processes
    # the untouched data instead of the already-filtered file.
    if not os.path.isfile(original_path):
        os.rename(destination_path, original_path)

    row_count = 0
    keep_count = 0

    # Context managers guarantee the files are closed even on error;
    # GBIF ships the backbone as UTF-8, so decode explicitly rather
    # than relying on the platform default encoding.
    with open(original_path, 'r', encoding='utf-8') as infile, \
         open(destination_path, 'w', encoding='utf-8') as outfile:

        for row in infile:

            record = row.rstrip('\n').split('\t')
            language = record[2]
            source = record[7]

            if (row_count == 0 or # Header row
                (language in ALLOWED_LANGUAGES and
                 source not in DISALLOWED_SOURCES)):
                keep_count += 1
                outfile.write(row)

            row_count += 1

    print(f'Done. Processed {row_count} rows. Kept {keep_count} rows')

def main(argv):
    """Pre-process the backbone files found in the given source directory.

    argv[1], when present, overrides the default source directory.
    """
    if len(argv) > 1:
        src_dir = argv[1]
    else:
        src_dir = '/data/bie-index/import/backbone'
    print(f'Using {src_dir} as source directory')

    process_taxon(src_dir)

    process_vernacular_name(src_dir)

    print('\nAll done')

if __name__ == '__main__':
    main(sys.argv)

0 comments on commit b41ac30

Please sign in to comment.