Skip to content

Commit

Permalink
Add script to pre-process gbif backbone (#3)
Browse files Browse the repository at this point in the history
* First take on process-backbone.py

* Copy original files first

* Handle disallowed sources

* Update comments and readme

* Update readme
  • Loading branch information
matsbov authored Jan 8, 2024
1 parent 8da0efc commit b41ac30
Show file tree
Hide file tree
Showing 3 changed files with 297 additions and 6 deletions.
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,12 @@ run-docker:

release:
../sbdi-install/utils/make-release.sh

# Download and unpack the current GBIF backbone taxonomy into the import dir.
fetch-backbone:
	mkdir -p /data/bie-index/import
	# -f: do not fail on the first run, when the directory does not exist yet
	rm -rf /data/bie-index/import/backbone
	wget https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip -O /data/bie-index/import/backbone.zip
	unzip -q /data/bie-index/import/backbone.zip -d /data/bie-index/import/backbone/

# Pre-process the downloaded backbone before importing it into the bie-index.
process-backbone:
	./sbdi/process-backbone.py /data/bie-index/import/backbone
22 changes: 16 additions & 6 deletions sbdi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,19 @@ Tag 1.0.2 created and pushed.

## Build search index

The solr search index can be built from scratch in the following way from the /admin page:

* Select **DwCA Import** and import the [GBIF Backbone Taxonomy](https://www.gbif.org/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c) (~5h)
* Select **Create Links** and run:
* **Build search and suggest weights** (~7h)
* **Build solr suggestion index** (~15min, application will throw a read timeout exception but indexing will continue to run on Solr)
The solr search index can be built from scratch from the [GBIF Backbone Taxonomy](https://www.gbif.org/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c). Before importing the data it needs to be pre-processed (see [process-backbone.py](./process-backbone.py) for details).

* Download backbone (to `/data/bie-index/import`):
```
make fetch-backbone
```
* Pre-process backbone:
```
make process-backbone
```
* Go to the /admin page and select **DwCA Import** and import from `/data/bie-index/import/backbone` (~2:15h)
* Go to the /admin page and select **Create Links** and run:
* **Denormalise taxa** (~8h)
* **Build link identifiers** (~7h) (not sure if this is necessary)
* **Build search and suggest weights** (~2:15h)
* **Build solr suggestion index** (~15min - the application will throw a read timeout exception but indexing will continue to run on Solr)
272 changes: 272 additions & 0 deletions sbdi/process-backbone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
#!/usr/bin/env python3
#
# This script pre-processes the GBIF Backbone taxonomy before loading it into the bie-index.
#
# The original files are renamed (eg. Taxon.tsv -> Taxon.tsv.original) and the processed file
# is saved with the original name (eg. Taxon.tsv).
#
# The following processing is done:
#
# Taxon
# -----
# - Remove scientificNameAuthorship from scientificName (if included) because the bie-index
# expects the scientificName to be without authorship.
# Eg: Capreolus capreolus (Linnaeus, 1758) -> Capreolus capreolus
#
# VernacularName
# --------------
# - Only include Swedish and English names
# - Exclude names from some sources of bad quality
#

import os
import sys

# Vernacular-name languages to keep (ISO 639-1 codes).
ALLOWED_LANGUAGES = [
    'sv',
    'en',
]
# Vernacular-name sources to drop entirely, due to poor data quality.
DISALLOWED_SOURCES = [
    'Belgian Species List', # Contains comma-separated lists of names
    # All of these have names in various languages wrongly tagged as English
    'Abrocomidae',
    'Acrobatidae',
    'Ailuridae',
    'Alpheidae',
    'Annelida',
    'Anomaluridae',
    'Antilocapridae',
    'Aotidae',
    'Aplodontiidae',
    'Atelidae',
    'Balaenidae',
    'Balaenopteridae',
    'Bathyergidae',
    'Bovidae',
    'Bradypodidae',
    'Burramyidae',
    'Caenolestidae',
    'Calomyscidae',
    'Camelidae',
    'Canidae',
    'Castoridae',
    'Caviidae',
    'Cebidae',
    'Cercopithecidae',
    'Cervidae',
    'Cheirogaleidae',
    'Chinchillidae',
    'Chlamyphoridae',
    'Chrysochloridae',
    'Cnidaria',
    'Cricetidae',
    'Ctenodactylidae',
    'Ctenomyidae',
    'Cuniculidae',
    'Cyclopedidae',
    'Cynocephalidae',
    'Dasypodidae',
    'Dasyproctidae',
    'Dasyuridae',
    'Daubentoniidae',
    'Delphinidae',
    'Diatomyidae',
    'Didelphidae',
    'Dinomyidae',
    'Dipodidae',
    'Dugongidae',
    'Echimyidae',
    'Echinoderms',
    'Elephantidae',
    'Equidae',
    'Erethizontidae',
    'Erinaceidae',
    'Eschrichtiidae',
    'Eupleridae',
    'Felidae',
    'Galagidae',
    'Geomyidae',
    'Giraffidae',
    'Gliridae',
    'Herpestidae',
    'Heterocephalidae',
    'Heteromyidae',
    'Hippopotamidae',
    'Hipposideridae',
    'Hominidae',
    'Hyaenidae',
    'Hylobatidae',
    'Hypsiprymnodontidae',
    'Hystricidae',
    'Indriidae',
    'Iniidae',
    'Lemuridae',
    'Lepilemuridae',
    'Leporidae',
    'Lipotidae',
    'Lorisidae',
    'Macropodidae',
    'Macroscelididae',
    'Manidae',
    'Megalonychidae',
    'Mephitidae',
    'Molossidae',
    'Monodontidae',
    'Mormoopidae',
    'Moschidae',
    'Muridae',
    'Mustelidae',
    'Myriatrix',
    'Myrmecobiidae',
    'Myrmecophagidae',
    'Mystacinidae',
    'Myzopodidae',
    'Nandiniidae',
    'Natalidae',
    'Nayades',
    'Neobalaenidae',
    'Nesomyidae',
    'Noctilionidae',
    'Notoryctidae',
    'Nycteridae',
    'Ochotonidae',
    'Octodontidae',
    'Odobenidae',
    'Orycetropodidae',
    'Otariidae',
    'Pedetidae',
    'Peracarida',
    'Peramelidae',
    'Petromuridae',
    'Phalangeridae',
    'Phitheciidae',
    'Phocidae',
    'Phocoenidae',
    'Phyllostomidae',
    'Physeteridae',
    'Platacanthomyidae',
    'Platanistidae',
    'Pontoporiidae',
    'Porifera',
    'Potamogalidae',
    'Potoroidae',
    'Procaviidae',
    'Procyonidae',
    'Pseudocheiridae',
    'Pteropodidae',
    'Ptilocercidae',
    'Rhinocerotidae',
    'Rhinolophidae',
    'Rhinonycteridae',
    'Rhinopomatidae',
    'Sciuridae',
    'Sminthidae',
    'Solenodontidae',
    'Soricidae',
    'Spalacidae',
    'Suidae',
    'Talpidae',
    'Tapiridae',
    'Tarsiidae',
    'Tarsipedidae',
    'Tayassuidae',
    'Tenrecidae',
    'Thryonomyidae',
    'Thylacomyidae',
    'Thyropteridae',
    'Tragulidae',
    'Trichechidae',
    'Tupaiidae',
    'Ursidae',
    'Vespertilionidae',
    'Viverridae',
    'Vombatidae',
    'Zapodidae',
    'Ziphiidae',
]

def process_taxon(src_dir):
    """Strip scientificNameAuthorship from scientificName in Taxon.tsv.

    The bie-index expects scientificName without authorship, e.g.
    'Capreolus capreolus (Linnaeus, 1758)' -> 'Capreolus capreolus'.

    The original file is renamed to Taxon.tsv.original (kept for
    reference) and the processed rows are written to Taxon.tsv.
    """
    print('\nProcess taxon')

    destination_path = f'{src_dir}/Taxon.tsv'
    original_path = f'{destination_path}.original'

    # Rename original file (if not already done), so a re-run processes
    # the untouched data instead of the already-processed file.
    if not os.path.isfile(original_path):
        os.rename(destination_path, original_path)

    row_count = 0

    # Context managers guarantee the files are closed even on error;
    # GBIF ships the backbone as UTF-8, so decode explicitly rather
    # than relying on the platform default encoding.
    with open(original_path, 'r', encoding='utf-8') as infile, \
         open(destination_path, 'w', encoding='utf-8') as outfile:

        for row in infile:

            record = row.rstrip('\n').split('\t')
            scientificName = record[5]
            scientificNameAuthorship = record[6]

            # Remove scientificNameAuthorship from scientificName
            if scientificNameAuthorship and scientificName.endswith(scientificNameAuthorship):
                record[5] = scientificName[:-len(scientificNameAuthorship)].strip()

            outfile.write('\t'.join(record) + '\n')

            row_count += 1

            if row_count % 1000000 == 0:
                print(f'Processed {row_count} rows')

    print(f'Done. Processed {row_count} rows')

def process_vernacular_name(src_dir):
    """Filter VernacularName.tsv down to usable names.

    Keeps the header row plus every row whose language is in
    ALLOWED_LANGUAGES and whose source is not in DISALLOWED_SOURCES.

    The original file is renamed to VernacularName.tsv.original (kept
    for reference) and the filtered rows are written to
    VernacularName.tsv.
    """
    print('\nProcess vernacular name')

    destination_path = f'{src_dir}/VernacularName.tsv'
    original_path = f'{destination_path}.original'

    # Rename original file (if not already done), so a re-run processes
    # the untouched data instead of the already-filtered file.
    if not os.path.isfile(original_path):
        os.rename(destination_path, original_path)

    row_count = 0
    keep_count = 0

    # Context managers guarantee the files are closed even on error;
    # GBIF ships the backbone as UTF-8, so decode explicitly rather
    # than relying on the platform default encoding.
    with open(original_path, 'r', encoding='utf-8') as infile, \
         open(destination_path, 'w', encoding='utf-8') as outfile:

        for row in infile:

            record = row.rstrip('\n').split('\t')
            language = record[2]
            source = record[7]

            if (row_count == 0 or # Header row
                (language in ALLOWED_LANGUAGES and
                 source not in DISALLOWED_SOURCES)):
                keep_count += 1
                outfile.write(row)

            row_count += 1

    print(f'Done. Processed {row_count} rows. Kept {keep_count} rows')

def main(argv):
    """Pre-process the backbone files found in the given source directory.

    argv[1], when present, overrides the default source directory.
    """
    if len(argv) > 1:
        src_dir = argv[1]
    else:
        src_dir = '/data/bie-index/import/backbone'
    print(f'Using {src_dir} as source directory')

    process_taxon(src_dir)

    process_vernacular_name(src_dir)

    print('\nAll done')

if __name__ == '__main__':
    main(sys.argv)

0 comments on commit b41ac30

Please sign in to comment.