From 1838fb5ebfd00329f2a05a95ce98dbbd8e171259 Mon Sep 17 00:00:00 2001 From: Saksham Arora Date: Tue, 23 Jan 2024 14:53:11 +0100 Subject: [PATCH] importer: Update tests for normalized title matching --- cds_ils/importer/documents/api.py | 4 +++- cds_ils/importer/documents/importer.py | 16 +++++++++++++--- tests/importer/test_document_matching.py | 1 + 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/cds_ils/importer/documents/api.py b/cds_ils/importer/documents/api.py index ff63e5b84..34101d86d 100644 --- a/cds_ils/importer/documents/api.py +++ b/cds_ils/importer/documents/api.py @@ -40,7 +40,9 @@ def search_document_by_title_authors(title, authors, subtitle=None): """Find document by title and authors.""" document_search = current_app_ils.document_search_cls() - title = " ".join(title.lower().split()) # Normalized title search for documents + title = " ".join( + title.lower().split() + ).strip() # Normalized title search for documents if subtitle: search = ( document_search.filter("term", title__normalized_keyword=title) diff --git a/cds_ils/importer/documents/importer.py b/cds_ils/importer/documents/importer.py index 754f9bd6d..cf735b862 100644 --- a/cds_ils/importer/documents/importer.py +++ b/cds_ils/importer/documents/importer.py @@ -199,6 +199,12 @@ def _update_field_alternative_identifiers(self, matched_document): return existing_identifiers + new_identifiers + @staticmethod + def _normalize_title(title): + """Return a normalized title.""" + t = " ".join(title.lower().split()) + return t.strip() + def update_document(self, matched_document): """Update document record.""" for field in self.update_document_fields: @@ -263,11 +269,13 @@ def _validate_volumes(self, existing_document): for import_serial in import_doc_serials: import_volume = import_serial.get("volume") - import_serial_title = import_serial["title"].lower() + import_serial_title = self._normalize_title(import_serial["title"]) for serial in existing_doc_serials: existing_volume = serial.get("volume") - existing_title = serial["record_metadata"]["title"].lower() + existing_title = self._normalize_title( + serial["record_metadata"]["title"] + ) same_serial = existing_title == import_serial_title both_have_volumes = import_volume and existing_volume @@ -322,7 +330,9 @@ def validate_found_matches(self, not_validated_matches): ) pub_year_not_equal = doc_pub_year != import_doc_publication_year - titles_not_equal = document_title.lower() != import_doc_title.lower() + titles_not_equal = self._normalize_title( + document_title + ) != self._normalize_title(import_doc_title) provider_identifiers_not_equal = ( not self._validate_provider_identifiers(document) diff --git a/tests/importer/test_document_matching.py b/tests/importer/test_document_matching.py index cee04fe3a..e6d8bbeec 100644 --- a/tests/importer/test_document_matching.py +++ b/tests/importer/test_document_matching.py @@ -78,6 +78,7 @@ def test_document_search_matching(importer_test_data): validated_matches, partial = document_importer.validate_found_matches(matches) assert validated_matches == "docid-4" + assert partial == ["docid-41"] def test_fuzzy_matching(importer_test_data):