Adjust extractor workflow #3

Open · wants to merge 5 commits into master

Changes from 1 commit
Consider the records language-specific
Extract abstracts and otherSubjects
Read nwbib dump line by line
Petra Maier committed Jan 9, 2025
commit d47a8c7b09584d4d61eb651dc2cbfc99745e378b
nwbib_extractor.py (87 changes: 56 additions & 31 deletions)
@@ -3,12 +3,12 @@
 import json
 from random import shuffle
 import re
-from os.path import join, splitext
-from os import listdir
 import sys
 
-CHUNK_DIR = "chunks"
+# curl --header "Accept-Encoding: gzip" "http://lobid.org/resources/search?q=inCollection.id%3A%22http%3A%2F%2Flobid.org%2Fresources%2FHT014176012%23%21%22&format=jsonl" > nwbib.gz
+# gunzip it (file name "nwbib")
+
+NWBIB_FILE = "nwbib"
 TARGET_TRAIN_FILE = "nwbib_subjects_train.tsv"
 TARGET_TEST_FILE = "nwbib_subjects_test.tsv"
 TARGET_NO_SUBJECTS_FILE = "nwbib_unindexed_titles.txt"
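
The header comment above documents the new manual download step. As a side note, the gunzip step could be skipped by reading the compressed dump directly; a minimal standard-library sketch (the file name nwbib.gz comes from the comment above, everything else is illustrative):

    import gzip
    import json

    # Stream the gzipped JSONL dump without unpacking it first;
    # "rt" opens the archive in text mode, so iteration yields one line per record.
    with gzip.open("nwbib.gz", "rt", encoding="utf-8") as dump:
        for line in dump:
            record = json.loads(line)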
@@ -35,24 +35,38 @@ def extract_data(record):
     ret = {
         'title': '',
         'otherTitleInformation': '',
-        'subjects': []
+        'subjects': [],
+        'language': [],
+        'otherSubjects': [],
+        'abstract': ''
     }
-    ret['title'] = record.get('title', '')
-    if 'otherTitleInformation' in record:
-        ret['otherTitleInformation'] = ', '.join(record['otherTitleInformation'])
-    subjects = record.get('subject', [])
-    for subject_dict in subjects:
-        source_id = subject_dict.get("id", '')
-        if source_id.startswith("https://nwbib.de/subjects"):
-            label = subject_dict.get("label", '')
-            if SKOS_VOCAB_TERMS is None:
-                ret["subjects"].append((source_id, label))
-            else:
-                if source_id in SKOS_VOCAB_TERMS:
-                    ret["subjects"].append((source_id, label))
-                else:
-                    msg = 'Warning: Subject {} ({}) not found in provided SKOS vocabulary - skipping'
-                    print(msg.format(source_id, label))
+    lang_ids = ["http://id.loc.gov/vocabulary/iso639-2/ger", "http://id.loc.gov/vocabulary/iso639-2/eng"]
+    if record.get("language") is not None:
+        for rec_lang in record.get("language"):
+            lang_id = rec_lang.get("id")
+            if lang_id in lang_ids:
+                ret["language"] = record.get('language', [])
+    ret['title'] = record.get('title', '')
+    if 'otherTitleInformation' in record:
+        ret['otherTitleInformation'] = ', '.join(record['otherTitleInformation'])
+    if "abstract" in record:
+        ret['abstract'] = "".join(record["abstract"])
+    subjects = record.get('subject', [])
+    for subject_dict in subjects:
+        source_id = subject_dict.get("id", '')
+        label = subject_dict.get("label", '')
+        if source_id.startswith("https://nwbib.de/subjects"):
+            if SKOS_VOCAB_TERMS is None:
+                ret["subjects"].append((source_id, label))
+            else:
+                if source_id in SKOS_VOCAB_TERMS:
+                    ret["subjects"].append((source_id, label))
+                else:
+                    msg = 'Warning: Subject {} ({}) not found in provided SKOS vocabulary - skipping'
+                    print(msg.format(source_id, label))
+        else:
+            ret["otherSubjects"].append(label)
 
     return ret
 
 def _extract_voc_terms(voc_file_path):
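
To illustrate the reworked extract_data, here is a hypothetical input record (all field values invented, structure inferred from the code above) and the dict it would produce when no SKOS vocabulary is loaded:

    record = {
        "title": "Geschichte des Ruhrgebiets",
        "language": [{"id": "http://id.loc.gov/vocabulary/iso639-2/ger"}],
        "abstract": ["Ein Überblick ", "in zwei Teilen."],
        "subject": [
            {"id": "https://nwbib.de/subjects#N20", "label": "Geschichte"},
            {"label": "Ruhrgebiet"},  # no nwbib id, goes to otherSubjects
        ],
    }

    extract_data(record)
    # {'title': 'Geschichte des Ruhrgebiets',
    #  'otherTitleInformation': '',
    #  'subjects': [('https://nwbib.de/subjects#N20', 'Geschichte')],
    #  'language': [{'id': 'http://id.loc.gov/vocabulary/iso639-2/ger'}],
    #  'otherSubjects': ['Ruhrgebiet'],
    #  'abstract': 'Ein Überblick in zwei Teilen.'}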
@@ -67,7 +81,10 @@ def _extract_voc_terms(voc_file_path):
         SKOS_VOCAB_TERMS.append(term)
 
 def _prepare_tsv_data(record):
-    combined_title = record["title"] if not record["otherTitleInformation"] else record["title"] + " - " + record["otherTitleInformation"]
+    #combined_title = record["title"] if not record["otherTitleInformation"] else record["title"] + " - " + record["otherTitleInformation"]
+    comb_title = {k: v for k, v in record.items() if v}
+    del comb_title["subjects"], comb_title["language"]
+    combined_title = ' '.join(str(v) for k, v in comb_title.items())
     subjects = ["<" + subject_tup[0] + ">" for subject_tup in record["subjects"]]
     line = [combined_title] + subjects
     return line
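
The rewritten _prepare_tsv_data no longer joins just title and subtitle: it drops every empty field, removes subjects and language, and joins all remaining values with spaces. Applied to the extracted dict from the sketch above:

    comb_title = {k: v for k, v in record.items() if v}   # '' and [] values drop out
    del comb_title["subjects"], comb_title["language"]
    # left over: title, otherSubjects, abstract
    combined_title = ' '.join(str(v) for k, v in comb_title.items())
    # "Geschichte des Ruhrgebiets ['Ruhrgebiet'] Ein Überblick in zwei Teilen."

Note that list-valued fields such as otherSubjects pass through str() and therefore land in the output in their Python repr form, brackets and quotes included.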
@@ -87,6 +104,16 @@ def _print_stats(stats):
         print("{}: {}".format(k, v))
     print("\n\n")
 
+def filter_language(record):
+    lang_ids = []
+    if record.get("language") is not None:
+        for rec_lang in record.get("language"):
+            lang_id = rec_lang.get("id")
+            lang_ids.append(lang_id)
+    lang_id_list = "".join(lang_ids)
+    if "http://id.loc.gov/vocabulary/iso639-2/ger" in lang_id_list or "http://id.loc.gov/vocabulary/iso639-2/eng" in lang_id_list:
+        return True
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("-s", "--stats", action="store_true",
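
filter_language returns True for German- or English-language records and otherwise falls through to an implicit None, which the boolean test at the call site treats as falsy. An equivalent, more explicit variant (just a sketch, not part of this change) could use a set and any():

    ACCEPTED_LANG_IDS = {
        "http://id.loc.gov/vocabulary/iso639-2/ger",
        "http://id.loc.gov/vocabulary/iso639-2/eng",
    }

    def filter_language(record):
        # True if any attached language id is German or English.
        return any(lang.get("id") in ACCEPTED_LANG_IDS
                   for lang in record.get("language") or [])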
@@ -116,18 +143,16 @@ def main():
     valid_records = []
     records_without_subjects = []
 
-    for filename in listdir(CHUNK_DIR):
-        path = join(CHUNK_DIR, filename)
-        with open(path) as f:
-            content = f.read()
+    with open(NWBIB_FILE) as input_file:
+        for line in input_file:
             try:
-                json_dicts = json.loads(content)
+                json_dict = json.loads(line)
             except json.decoder.JSONDecodeError as jsond:
-                print("Could not read from file {}: {}".format(path, jsond))
+                print("Could not read from file {}: {}".format(line, jsond))
                 continue
-            for record in json_dicts:
-                data = extract_data(record)
-                if data["subjects"]:
+            data = extract_data(json_dict)
+            if data is not None:
+                if data["subjects"] and filter_language(data):
                     valid_records.append(data)
                 else:
                     records_without_subjects.append(data)
@@ -136,7 +161,7 @@ def main():
     print(str(stats["total_records"]) + " records processed")
     if args.stats:
         # collect statistical data
-        for key in record.keys():
+        for key in json_dict.keys():
             if key not in stats["record_keys_distribution"]:
                 stats["record_keys_distribution"][key] = 1
             else:
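
Taken together, the commit replaces the old chunk-directory scan with a single streaming pass over the dump. Condensed into a standalone sketch (the import of this script as a module is hypothetical and assumes it has no import-time side effects):

    import json

    from nwbib_extractor import extract_data, filter_language

    valid_records = []
    with open("nwbib") as input_file:
        for line in input_file:
            try:
                record = json.loads(line)
            except json.decoder.JSONDecodeError:
                continue  # skip malformed lines
            data = extract_data(record)
            if data["subjects"] and filter_language(data):
                valid_records.append(data)

    print(len(valid_records), "German/English records with NWBib subjects")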