From 218f33d074d8bf2636650498ad8a7606ecf6a2bc Mon Sep 17 00:00:00 2001
From: Kial
Date: Thu, 26 Sep 2024 11:37:48 -0400
Subject: [PATCH] Importer - updates for btr changes (#280)

Signed-off-by: Kial Jinnah
---
 search-solr-importer/data_import_handler.py  |  6 ++-
 .../utils/data_collection.py                 | 10 ++++-
 .../utils/data_parsing.py                    | 38 +++++++++----------
 3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/search-solr-importer/data_import_handler.py b/search-solr-importer/data_import_handler.py
index 8a91d48..6cd60b9 100644
--- a/search-solr-importer/data_import_handler.py
+++ b/search-solr-importer/data_import_handler.py
@@ -97,6 +97,7 @@ def load_search_core():  # pylint: disable=too-many-statements,too-many-locals,t
         current_app.logger.debug('---------- Collecting/Importing BTR Data ----------')
         btr_fetch_count = 0
         batch_limit = current_app.config.get('BTR_BATCH_LIMIT')
+        btr_data_descs = []
         loop_count = 0
         while loop_count < 100:  # NOTE: should never get to this condition
@@ -110,7 +111,10 @@ def load_search_core():  # pylint: disable=too-many-statements,too-many-locals,t
                 break
 
             current_app.logger.debug('********** Mapping BTR data **********')
-            prepped_btr_data = prep_data_btr(btr_data)
+            if not btr_data_descs:
+                # just need to do once
+                btr_data_descs = [desc[0].lower() for desc in btr_data_cur.description]
+            prepped_btr_data = prep_data_btr(btr_data, btr_data_descs)
             current_app.logger.debug(f'{len(prepped_btr_data)} BTR records ready for import.')
 
             current_app.logger.debug('********** Importing BTR entities **********')
diff --git a/search-solr-importer/src/search_solr_importer/utils/data_collection.py b/search-solr-importer/src/search_solr_importer/utils/data_collection.py
index cde0d12..04278f1 100644
--- a/search-solr-importer/src/search_solr_importer/utils/data_collection.py
+++ b/search-solr-importer/src/search_solr_importer/utils/data_collection.py
@@ -145,7 +145,7 @@ def collect_btr_data(limit: int = None, offset: int = None):
         limit_clause += f' OFFSET {offset}'
     if limit_clause:
         # NOTE: needed in order to make sure we get every record when doing batch loads
-        limit_clause = f'ORDER BY id {limit_clause}'
+        limit_clause = f'ORDER BY p.id {limit_clause}'
 
     current_app.logger.debug('Connecting to BTR GCP Postgres instance...')
     conn = psycopg2.connect(host=current_app.config.get('BTR_DB_HOST'),
@@ -155,5 +155,11 @@ def collect_btr_data(limit: int = None, offset: int = None):
                             password=current_app.config.get('BTR_DB_PASSWORD'))
     cur = conn.cursor()
     current_app.logger.debug('Collecting BTR data...')
-    cur.execute(f'SELECT payload FROM submission {limit_clause}')
+    cur.execute(f"""
+        SELECT s.business_identifier, p.person_json
+        FROM submission s
+        JOIN ownership o on s.id = o.submission_id
+        JOIN person p on p.id = o.person_id
+        {limit_clause}
+        """)
     return cur
diff --git a/search-solr-importer/src/search_solr_importer/utils/data_parsing.py b/search-solr-importer/src/search_solr_importer/utils/data_parsing.py
index e5a1286..6c35c2b 100644
--- a/search-solr-importer/src/search_solr_importer/utils/data_parsing.py
+++ b/search-solr-importer/src/search_solr_importer/utils/data_parsing.py
@@ -187,33 +187,33 @@ def prep_data(data: list, data_descs: list[str], source: str) -> list[dict]:  #
     return solr_docs
 
 
-def prep_data_btr(data: list[dict]) -> list[dict]:
+def prep_data_btr(data: list[dict], data_descs: list[str]) -> list[dict]:
     """Return the list of partial business docs containing the SI party information."""
     prepped_data: list[dict] = []
     for item in data:
-        submission = item[0]
-        identifier = submission['businessIdentifier']
+        item_dict = dict(zip(data_descs, item))
+        identifier = item_dict['business_identifier']
         business = {'id': identifier, 'parties': {'add': []}}
         # collect current SIs.
-        for person in submission.get('personStatements', []):
-            party_name = ''
-            for name in person.get('names'):
-                if name.get('type') == 'individual':  # expecting this to be 'individual' or 'alternative'
-                    party_name = name.get('fullName')
-                    break
-            if not party_name:
-                current_app.logger.debug('Person names: %s', person.get('names'))
-                current_app.logger.error('Error parsing SI name for %s', identifier)
-
-            business['parties']['add'].append({
-                PartyField.UNIQUE_KEY.value: identifier + '_' + person['uuid'],
-                PartyField.PARTY_NAME.value: party_name,
-                PartyField.PARTY_ROLE.value: ['significant individual'],
-                PartyField.PARENT_TYPE.value: 'person'
-            })
+        person = item_dict['person_json']
+        party_name = ''
+        for name in person.get('names'):
+            if name.get('type') == 'individual':  # expecting this to be 'individual' or 'alternative'
+                party_name = name.get('fullName')
+                break
+        if not party_name:
+            current_app.logger.debug('Person names: %s', person.get('names'))
+            current_app.logger.error('Error parsing SI name for %s', identifier)
+
+        business['parties']['add'].append({
+            PartyField.UNIQUE_KEY.value: identifier + '_' + person['statementID'],
+            PartyField.PARTY_NAME.value: party_name,
+            PartyField.PARTY_ROLE.value: ['significant individual'],
+            PartyField.PARENT_TYPE.value: 'person'
+        })
         prepped_data.append(business)
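
Note on the new prep_data_btr(data, data_descs) contract: rows now arrive as plain tuples from the joined query, and the handler passes in the lowercased column names read once from the psycopg2 cursor description. A minimal, self-contained sketch of that row-to-dict mapping follows; the sample column list and row values are invented for illustration and are not taken from the BTR database.

    # Sketch only: mirrors how the importer lowercases cursor.description names
    # once and zips each row tuple into a dict before building the party doc.
    sample_descs = ['business_identifier', 'person_json']  # i.e. [desc[0].lower() for desc in cur.description]
    sample_row = ('BC1234567', {'statementID': 'abc-123',
                                'names': [{'type': 'individual', 'fullName': 'Jane Doe'}]})

    item_dict = dict(zip(sample_descs, sample_row))
    person = item_dict['person_json']
    party_name = next((n['fullName'] for n in person['names'] if n.get('type') == 'individual'), '')
    unique_key = item_dict['business_identifier'] + '_' + person['statementID']
    print(unique_key, party_name)  # BC1234567_abc-123 Jane Doe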
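
The ORDER BY id to ORDER BY p.id change matters because the query now selects from a join and is paged with LIMIT/OFFSET; without a stable sort on the person rows, batches could overlap or skip records. A hedged sketch of the paging pattern the handler relies on follows; the import path, batch_limit value, and loop shape are assumptions for illustration, not the handler's exact code (which also caps loop_count and logs progress).

    # Sketch only: the batch-read pattern, not the real data_import_handler loop.
    from search_solr_importer.utils import collect_btr_data, prep_data_btr  # import path assumed

    batch_limit = 1000  # the real handler reads BTR_BATCH_LIMIT from config
    offset = 0
    btr_data_descs = []
    while True:
        cur = collect_btr_data(limit=batch_limit, offset=offset)  # ORDER BY p.id LIMIT ... OFFSET ...
        rows = cur.fetchall()
        if not rows:
            break
        if not btr_data_descs:
            # column names only need to be read from the cursor once
            btr_data_descs = [desc[0].lower() for desc in cur.description]
        prepped = prep_data_btr(rows, btr_data_descs)
        # ... import prepped docs into solr here ...
        offset += batch_limit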