
Commit

Refactor metadata indexing, extend provenance tracking
marijnkoolen committed Jul 19, 2022
1 parent 3f7a0ac commit 8e4b520
Showing 8 changed files with 103 additions and 28 deletions.
39 changes: 34 additions & 5 deletions do_indexing.py
@@ -19,6 +19,7 @@
from republic.model.inventory_mapping import get_inventories_by_year, get_inventory_by_num
from republic.model.republic_text_annotation_model import make_session_text_version
import republic.model.republic_document_model as rdm
import republic.model.resolution_phrase_model as rpm

import republic.parser.logical.pagexml_session_parser as session_parser
import republic.parser.pagexml.republic_pagexml_parser as pagexml_parser
@@ -168,7 +169,8 @@ def do_session_lines_indexing(inv_num: int, year: int):
date_string = match.string
print('\tdate string:', date_string)
try:
prov_url = rep_es.post_provenance(source_ids, [session.id], 'pages', 'session_lines')
prov_url = rep_es.post_provenance(source_ids=source_ids, target_ids=[session.id],
source_index='pages', target_index='session_lines')
session.metadata['prov_url'] = prov_url
rep_es.index_session_with_lines(session)
except ElasticsearchException as error:
@@ -205,6 +207,9 @@ def do_resolution_indexing(inv_num: int, year: int):
for resolution in res_parser.get_session_resolutions(session, opening_searcher,
verb_searcher,
line_break_detector=line_break_detector):
prov_url = rep_es.post_provenance(source_ids=[session.id], target_ids=[resolution.id],
source_index='session_lines', target_index='resolutions')
resolution.metadata['prov_url'] = prov_url
rep_es.index_resolution(resolution)
except (TypeError, KeyError) as err:
errors.append(err)
@@ -232,7 +237,31 @@ def do_resolution_phrase_match_indexing(inv_num: int, year: int):


def do_resolution_metadata_indexing(inv_num: int, year: int):
print(f"Indexing PageXML resolution phrase matches for inventory {inv_num} (year {year})...")
print(f"Indexing PageXML resolution metadata for inventory {inv_num} (year {year})...")
searcher = res_parser.make_resolution_phrase_model_searcher()
relative_path = rpm.__file__.split("republic-project/")[-1]
repo_url = 'https://github.com/HuygensING/republic-project'
phrase_file = f'{repo_url}/blob/{get_commit_version()}/{relative_path}'
prop_searchers = extract_res.generate_proposition_searchers()
for resolution in rep_es.scroll_inventory_resolutions(inv_num):
phrase_matches = extract_res.extract_paragraph_phrase_matches(resolution.paragraphs[0],
[searcher])
new_resolution = extract_res.add_resolution_metadata(resolution, phrase_matches,
prop_searchers['template'],
prop_searchers['variable'])
session_id = resolution.metadata['source_id']
prov_url = rep_es.post_provenance(source_ids=[session_id], target_ids=[resolution.id],
source_index='session_lines', target_index='resolutions',
source_external_urls=[phrase_file],
why='Enriching resolution with metadata derived from resolution phrases')
if isinstance(new_resolution.metadata['prov_url'], str):
new_resolution.metadata['prov_url'] = [new_resolution.metadata['prov_url']]
new_resolution.metadata['prov_url'].append(prov_url)
rep_es.index_resolution(new_resolution)


def do_resolution_metadata_indexing_old(inv_num: int, year: int):
print(f"Indexing PageXML resolution metadata for inventory {inv_num} (year {year})...")
prop_searchers = extract_res.generate_proposition_searchers()
# proposition_searcher, template_searcher, variable_matcher = generate_proposition_searchers()
skip_formulas = {
@@ -260,8 +289,6 @@ def do_resolution_metadata_indexing(inv_num: int, year: int):
new_resolution = extract_res.add_resolution_metadata(resolution, phrase_matches,
prop_searchers['template'],
prop_searchers['variable'])
if 'proposition_type' not in new_resolution.metadata or new_resolution.metadata['proposition_type'] is None:
new_resolution.metadata['proposition_type'] = 'unknown'
if not new_resolution:
no_new += 1
continue
@@ -278,7 +305,9 @@ def do_inventory_attendance_list_indexing(inv_num: int, year: int):

def do_inventory_attendance_list_indexing(inv_num: int, year: int):
print(f"Indexing attendance lists with spans for inventory {inv_num} (year {year})...")
att_spans_year = run_attendancelist.run(rep_es.es_anno, year, outdir=None, verbose=True, tofile=False)
att_spans_year = run_attendancelist.run(rep_es.es_anno, year, outdir=None,
verbose=True, tofile=False,
source_index=rep_es.config['resolutions_index'])
if att_spans_year is None:
return None
for span_list in att_spans_year:
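Note on the prov_url handling in do_resolution_metadata_indexing above: because enrichment re-indexes a resolution that already carries a provenance URL, the existing string value is promoted to a list so records can accumulate. A minimal sketch of that pattern, with illustrative URLs rather than real provenance records:

# Illustrative only: a resolution indexed earlier holds one provenance URL as a string.
resolution.metadata['prov_url'] = 'https://prov.example.org/prov/123'
# On enrichment the string is wrapped in a list and the new record is appended.
if isinstance(resolution.metadata['prov_url'], str):
    resolution.metadata['prov_url'] = [resolution.metadata['prov_url']]
resolution.metadata['prov_url'].append('https://prov.example.org/prov/456')
# resolution.metadata['prov_url'] now lists both provenance records.
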
10 changes: 3 additions & 7 deletions republic/elastic/attendancelist_retrieval.py
@@ -6,7 +6,7 @@
es_api_version = elasticsearch.__version__


def make_presentielijsten(es: Elasticsearch, year: int, index: str = 'session_text'):
def make_presentielijsten(es: Elasticsearch, year: int, index: str):
return get_presentielijsten(es=es, year=year, index=index)


@@ -28,15 +28,11 @@ def query_es(es: Elasticsearch, index, query, size=10, sort=None, aggs=None):
return es.search(index=index, query=query, size=size, sort=sort, aggs=aggs)


def get_presentielijsten(es: Elasticsearch, year: int, index: str = 'session_text'):
if index == 'resolutions':
type_field = 'metadata.type.keyword'
else:
type_field = 'annotations.metadata.type.keyword'
def get_presentielijsten(es: Elasticsearch, year: int, index: str):
query = {
"bool": {
"must": [
{"term": {type_field: "attendance_list"}},
{"term": {'metadata.type.keyword': "attendance_list"}},
{"term": {"metadata.session_year": year}}]
}
}
6 changes: 4 additions & 2 deletions republic/elastic/republic_elasticsearch.py
@@ -52,10 +52,12 @@ def __init__(self, es_anno: Elasticsearch, es_text: Elasticsearch, config: dict,
self.es_anno_config = set_elasticsearch_config(host_type)

def post_provenance(self, source_ids: List[str], target_ids: List[str], source_index: str,
target_index: str, source_es_url: str = None):
target_index: str, source_es_url: str = None,
source_external_urls: List[str] = None, why: str = None):
data = make_provenance_data(es_config=self.es_anno_config, source_ids=source_ids,
target_ids=target_ids, source_index=source_index,
target_index=target_index, source_es_url=source_es_url)
target_index=target_index, source_es_url=source_es_url,
source_external_urls=source_external_urls, why=why)
response = requests.post(settings.prov_host_url, data=data,
headers={'Authorization': f'Basic: {settings.prov_api_key}'})
if response.status_code == 201:
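The extended post_provenance signature lets callers register non-Elasticsearch sources (source_external_urls) and pass a custom why message instead of the default pipeline description. A minimal sketch of a call with the new keyword arguments, mirroring the enrichment step in do_indexing.py; the ids and the blob URL are illustrative placeholders:

# Illustrative call; rep_es is a RepublicElasticsearch instance.
prov_url = rep_es.post_provenance(
    source_ids=['example-session-id'],
    target_ids=['example-resolution-id'],
    source_index='session_lines',
    target_index='resolutions',
    source_external_urls=['https://github.com/HuygensING/republic-project/blob/<commit>/republic/model/resolution_phrase_model.py'],
    why='Enriching resolution with metadata derived from resolution phrases'
)
# The returned prov_url is stored in the target document's metadata (see do_indexing.py above).
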
8 changes: 4 additions & 4 deletions republic/elastic/republic_indexing.py
@@ -13,7 +13,7 @@
from republic.model.republic_date import RepublicDate
from republic.helper.metadata_helper import get_per_page_type_index
from republic.helper.annotation_helper import make_match_hash_id
from republic.helper.utils import get_iso_utc_timestamp, get_commit_version
from republic.helper.utils import get_iso_utc_timestamp, get_commit_url


def add_timestamp(doc: Union[Dict[str, any], pdm.StructureDoc]) -> None:
@@ -28,12 +28,12 @@ def add_timestamp(doc: Union[Dict[str, any], pdm.StructureDoc]) -> None:

def add_commit(doc: Union[Dict[str, any], pdm.StructureDoc]) -> None:
if isinstance(doc, pdm.StructureDoc):
doc.metadata['code_commit'] = get_commit_version()
doc.metadata['code_commit'] = get_commit_url()
elif "metadata" not in doc and "inventory_uuid" in doc:
# datetime.datetime.now().isoformat()
doc["code_commit"] = get_commit_version()
doc["code_commit"] = get_commit_url()
else:
doc['metadata']['code_commit'] = get_commit_version()
doc['metadata']['code_commit'] = get_commit_url()


def get_pagexml_page_type(page: Union[pdm.PageXMLPage, Dict[str, any]],
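The switch from get_commit_version to get_commit_url means indexed documents now record a resolvable commit link rather than a bare hash; schematically (hash elided):

# Before: doc.metadata['code_commit'] == '<commit-hash>'
# After:  doc.metadata['code_commit'] == 'https://github.com/HuygensING/republic-project/commit/<commit-hash>'
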
11 changes: 11 additions & 0 deletions republic/extraction/extract_resolution_metadata.py
@@ -93,6 +93,15 @@ def add_variable_phrases(self, template_match: TemplateMatch,
return extended_elements


def extract_paragraph_phrase_matches(paragraph: RepublicParagraph,
searchers: List[FuzzyPhraseSearcher]) -> List[PhraseMatch]:
matches = []
for searcher in searchers:
doc = {'id': paragraph.id, 'text': paragraph.text}
matches += searcher.find_matches(doc)
return matches


def get_paragraph_phrase_matches(rep_es: RepublicElasticsearch,
resolution: Resolution) -> List[PhraseMatch]:
opening_para = resolution.paragraphs[0]
@@ -417,6 +426,8 @@ def add_resolution_metadata(resolution: Resolution, phrase_matches: List[PhraseM
if label.startswith('proposition_type'):
resolution.metadata['proposition_type'] = label.split(':')[1]
resolution.metadata = add_proposer_metadata(resolution, resolution.metadata)
if 'proposition_type' not in resolution.metadata or resolution.metadata['proposition_type'] is None:
resolution.metadata['proposition_type'] = 'unknown'
# print('resolution metadata:')
# print(json.dumps(resolution.metadata, indent=2))
return resolution
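A minimal usage sketch for the new extract_paragraph_phrase_matches helper, mirroring how do_indexing.py applies it to a resolution's opening paragraph; the searcher construction is taken from that file and the resolution is assumed to come from a scroll over the resolutions index:

# Illustrative usage: run one or more fuzzy searchers over the opening paragraph only.
searcher = res_parser.make_resolution_phrase_model_searcher()
opening_para = resolution.paragraphs[0]
phrase_matches = extract_paragraph_phrase_matches(opening_para, [searcher])
for match in phrase_matches:
    print(match.string)  # the matched text in the paragraph
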
18 changes: 14 additions & 4 deletions republic/helper/utils.py
@@ -11,26 +11,36 @@ def get_commit_version():
return completed_process.stdout.strip() if completed_process is not None else None


def get_commit_url():
commit_version = get_commit_version()
return f'https://github.com/HuygensING/republic-project/commit/{commit_version}'


def get_iso_utc_timestamp() -> str:
return datetime.datetime.now(datetime.timezone.utc).isoformat().replace('+00:00', 'Z')


def make_provenance_data(es_config, source_ids: List[str], target_ids: List[str],
source_index: str, target_index: str, source_es_url: str = None) -> Dict[str, any]:
source_index: str, target_index: str, source_es_url: str = None,
source_external_urls: List[str] = None, why: str = None) -> Dict[str, any]:
if source_es_url is None:
source_es_url = es_config['elastic_config']['url']
target_es_url = es_config['elastic_config']['url']
source_urls = [f'{source_es_url}{source_index}/_doc/{source_id}' for source_id in source_ids]
if source_external_urls is not None:
source_urls += source_external_urls
target_urls = [f'{target_es_url}{target_index}/_doc/{target_id}' for target_id in target_ids]
source_rels = ['primary'] * len(source_urls)
target_rels = ['primary'] * len(target_urls)
commit_version = get_commit_version()
commit_url = get_commit_url()
if why is None:
why = f'REPUBLIC CAF Pipeline deriving {target_index} from {source_index}'
return {
'who': 'orcid:0000-0002-0301-2029',
'where': 'https://annotation.republic-caf.diginfra.org/',
'when': get_iso_utc_timestamp(),
'how': f'https://github.com/HuygensING/republic-project/commit/{commit_version}',
'why': f'REPUBLIC CAF Pipeline deriving {target_index} from {source_index}',
'how': commit_url,
'why': why,
'source': source_urls,
'source_rel': source_rels,
'target': target_urls,
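For orientation, a sketch of how make_provenance_data assembles source and target URLs before building the payload; the base URL and ids are illustrative, and any source_external_urls are appended to the source list before the 'primary' relations are assigned:

# Illustrative values only; the real base URL comes from es_config['elastic_config']['url'].
es_url = 'https://es.example.org/'
source_urls = [f'{es_url}session_lines/_doc/{sid}' for sid in ['example-session-id']]
source_urls += ['https://github.com/HuygensING/republic-project/blob/<commit>/republic/model/resolution_phrase_model.py']
target_urls = [f'{es_url}resolutions/_doc/{tid}' for tid in ['example-resolution-id']]
# Every source URL, including the external one, is given source_rel 'primary';
# 'how' points at the commit URL and, when no why is passed, 'why' defaults to
# 'REPUBLIC CAF Pipeline deriving resolutions from session_lines'.
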
29 changes: 27 additions & 2 deletions republic/parser/logical/pagexml_resolution_parser.py
@@ -140,6 +140,25 @@ def get_resolution_text_page_nums(res_doc: Union[rdm.Resolution, rdm.AttendanceL
return sorted(list(text_page_nums))


def get_resolution_page_nums(res_doc: Union[rdm.Resolution, rdm.AttendanceList]) -> List[int]:
page_nums = set()
for para in res_doc.paragraphs:
for page_num in para.metadata["page_num"]:
if isinstance(page_num, int):
page_nums.add(page_num)
return sorted(list(page_nums))


def get_resolution_page_ids(res_doc: Union[rdm.Resolution, rdm.AttendanceList]) -> List[int]:
page_ids = set()
for para in res_doc.paragraphs:
if 'page_ids' not in para.metadata:
continue
for page_id in para.metadata["page_ids"]:
page_ids.add(page_id)
return sorted(list(page_ids))


def map_alt_langs(langs):
alt_langs = {
# Dutch is often confused with
@@ -219,12 +238,16 @@ def get_session_resolutions(session: rdm.Session, opening_searcher: FuzzyPhraseS
elif len(opening_matches) > 0:
if attendance_list:
attendance_list.metadata["text_page_num"] = get_resolution_text_page_nums(attendance_list)
attendance_list.metadata["page_num"] = get_resolution_page_nums(attendance_list)
attendance_list.metadata["page_ids"] = get_resolution_page_ids(attendance_list)
yield attendance_list
attendance_list = None
resolution_number += 1
if resolution:
resolution.set_proposition_type()
resolution.metadata["text_page_num"] = get_resolution_text_page_nums(resolution)
resolution.metadata["page_num"] = get_resolution_page_nums(resolution)
resolution.metadata["page_ids"] = get_resolution_page_ids(resolution)
yield resolution
metadata = get_base_metadata(session, generate_id(), 'resolution')
resolution = rdm.Resolution(doc_id=metadata['id'], metadata=metadata,
@@ -247,6 +270,8 @@ def get_session_resolutions(session: rdm.Session, opening_searcher: FuzzyPhraseS
resolution.metadata['lang'] = list({para.metadata['lang'] for para in resolution.paragraphs})
resolution.set_proposition_type()
resolution.metadata["text_page_num"] = get_resolution_text_page_nums(resolution)
resolution.metadata["page_num"] = get_resolution_page_nums(resolution)
resolution.metadata["page_ids"] = get_resolution_page_ids(resolution)
yield resolution


Expand All @@ -267,7 +292,8 @@ def get_base_metadata(source_doc: rdm.RepublicDoc, doc_id: str, doc_type: str) -
'inventory_num': source_doc.metadata['inventory_num'],
'source_id': source_doc.metadata['id'],
'type': doc_type,
'id': doc_id
'id': doc_id,
'page_ids': []
}
if doc_type in ["resolution", "attendance_list"]:
metadata['session_date'] = source_doc.metadata['session_date']
@@ -279,7 +305,6 @@ def get_base_metadata(source_doc: rdm.RepublicDoc, doc_id: str, doc_type: str) -
metadata['session_month'] = source_doc.metadata['session_month']
metadata['session_day'] = source_doc.metadata['session_day']
metadata['session_weekday'] = source_doc.metadata['session_weekday']
metadata['page_ids'] = []
return metadata


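The net effect of the new helpers is that each yielded resolution and attendance list now carries page numbers and page ids alongside the existing text page numbers; schematically, with illustrative values:

# Illustrative metadata on a yielded resolution:
# resolution.metadata['text_page_num']  e.g. [341, 342]
# resolution.metadata['page_num']       e.g. [170, 171]
# resolution.metadata['page_ids']       e.g. ['example-page-id-170', 'example-page-id-171']
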
10 changes: 6 additions & 4 deletions run_attendancelist.py
@@ -98,8 +98,9 @@ def prepare_found_delegates(framed_gtlm, found_delegates, year):
return framed_gtlm


def run(es: Elasticsearch, year=0, outdir='', tofile=True, verbose=True):
runner = RunAll(es=es, year=year)
def run(es: Elasticsearch, year=0, outdir='', tofile=True, verbose=True,
source_index: str = 'resolutions'):
runner = RunAll(es=es, year=year, source_index=source_index)
if verbose:
print("- gathering attendance lists")
if len(runner.searchobs) == 0:
@@ -151,11 +152,12 @@ def __init__(self, es: Elasticsearch,
found_delegates=found_delegates,
matchfnd=matchfinder,
ekwz=ekwz,
outdir=''
outdir='',
source_index: str = 'resolutions'
):
start_logger(outdir, year)
self.year = year
self.searchobs = make_presentielijsten(es=es, year=self.year, index='resolutions')
self.searchobs = make_presentielijsten(es=es, year=self.year, index=source_index)
logging.info(f'year: {year}, nr of attendancelists {len(self.searchobs)}')
self.junksweeper = make_junksweeper(ekwz)
self.abbreviated_delegates = abbreviated_delegates
