From 63ebb76e19b52f7f420444213008892a81584671 Mon Sep 17 00:00:00 2001 From: Quang Date: Thu, 9 Jan 2025 17:00:57 +0700 Subject: [PATCH] Refactor PDFUploader tests for clarity and add new test cases for upload scenarios --- dsst_etl/upload_pdfs.py | 2 +- tests/test_pdf_uploader.py | 116 ++++++++++++++++++++++++++----------- 2 files changed, 84 insertions(+), 34 deletions(-) diff --git a/dsst_etl/upload_pdfs.py b/dsst_etl/upload_pdfs.py index 59c16da..d26ccdf 100644 --- a/dsst_etl/upload_pdfs.py +++ b/dsst_etl/upload_pdfs.py @@ -174,7 +174,7 @@ def run_uploader( if not pdf_files: logger.warning(f"No PDF files found in {pdf_directory_path}") - return + return [], [] with open(metadata_json_file_path) as f: metadata = json.load(f) diff --git a/tests/test_pdf_uploader.py b/tests/test_pdf_uploader.py index b539d19..396e1dc 100644 --- a/tests/test_pdf_uploader.py +++ b/tests/test_pdf_uploader.py @@ -14,67 +14,117 @@ def setUp(self, mock_boto_client): super().setUp() self.mock_s3_client = MagicMock() mock_boto_client.return_value = self.mock_s3_client - # Initialize PDFUploader with the session self.uploader = PDFUploader(self.session) + self.base_dir = Path(__file__).resolve().parent + self.pdf_paths = f"{self.base_dir}/pdf-test" + self.metadata_json_file_path = f"{self.base_dir}/pdf-test/metadata.json" - def test_run_uploader(self): - # Test that the uploader runs without errors + def test_run_uploader_successful_uploads(self): + """Test that the uploader runs and uploads files successfully.""" self.mock_s3_client.upload_file.return_value = None - base_dir = Path(__file__).resolve().parent - - pdf_paths = f"{base_dir}/pdf-test" - metadata_json_file_path = f"{base_dir}/pdf-test/metadata.json" - - susccessful_uploads, failed_uploads = self.uploader.run_uploader( - pdf_directory_path=pdf_paths, - metadata_json_file_path=metadata_json_file_path, + successful_uploads, failed_uploads = self.uploader.run_uploader( + pdf_directory_path=self.pdf_paths, + metadata_json_file_path=self.metadata_json_file_path, comment="Test upload comment", ) - self.assertEqual(len(susccessful_uploads), 2) + self.assertEqual(len(successful_uploads), 2) self.assertEqual(len(failed_uploads), 0) - self.mock_s3_client.upload_file.assert_called() - self.assertEqual(self.session.query(Identifier).count(), 2) + def test_identifiers_created(self): + """Test that identifiers are created correctly.""" + self.test_run_uploader_successful_uploads() identifiers = self.session.query(Identifier).all() - self.assertEqual("10.1038/s41586-023-06521-1" in [identifier.doi for identifier in identifiers], True) - self.assertEqual("10.1038/s41586-023-06521-2" in [identifier.doi for identifier in identifiers], True) - - self.assertEqual(34567890 in [identifier.pmid for identifier in identifiers], True) - self.assertEqual(45678901 in [identifier.pmid for identifier in identifiers], True) - - self.assertEqual("PMC9876543" in [identifier.pmcid for identifier in identifiers], True) - self.assertEqual("PMC1234567" in [identifier.pmcid for identifier in identifiers], True) - + self.assertEqual(len(identifiers), 2) + self.assertIn("10.1038/s41586-023-06521-1", [identifier.doi for identifier in identifiers]) + self.assertIn("10.1038/s41586-023-06521-2", [identifier.doi for identifier in identifiers]) + self.assertIn(34567890, [identifier.pmid for identifier in identifiers]) + self.assertIn(45678901, [identifier.pmid for identifier in identifiers]) + self.assertIn("PMC9876543", [identifier.pmcid for identifier in identifiers]) + self.assertIn("PMC1234567", [identifier.pmcid for identifier in identifiers]) + + def test_documents_created(self): + """Test that documents are created correctly.""" + self.test_run_uploader_successful_uploads() documents = self.session.query(Documents).all() self.assertEqual(len(documents), 2) + self.assertIn("s3://osm-pdf-uploads/pdfs/test2.pdf", [document.s3uri for document in documents]) + self.assertIn("s3://osm-pdf-uploads/pdfs/test1.pdf", [document.s3uri for document in documents]) - self.assertEqual("s3://osm-pdf-uploads/pdfs/test2.pdf" in [document.s3uri for document in documents], True) - self.assertEqual("s3://osm-pdf-uploads/pdfs/test1.pdf" in [document.s3uri for document in documents], True) + def test_provenance_created(self): + """Test that provenance is created correctly.""" + self.test_run_uploader_successful_uploads() provenance = self.session.query(Provenance).first() self.assertEqual(provenance.comment, "Test upload comment") self.assertEqual(provenance.pipeline_name, "Document Upload") - + + def test_documents_linked_to_provenance(self): + """Test that documents are linked to the correct provenance.""" + self.test_run_uploader_successful_uploads() + + documents = self.session.query(Documents).all() + provenance = self.session.query(Provenance).first() self.assertEqual(documents[0].provenance_id, provenance.id) self.assertEqual(documents[1].provenance_id, provenance.id) + def test_works_created(self): + """Test that works are created correctly.""" + self.test_run_uploader_successful_uploads() + works = self.session.query(Works).all() + documents = self.session.query(Documents).all() self.assertEqual(len(works), 2) - self.assertEqual(works[0].initial_document_id in [document.id for document in documents], True) - self.assertEqual(works[1].initial_document_id in [document.id for document in documents], True) + self.assertIn(works[0].initial_document_id, [document.id for document in documents]) - self.assertEqual(works[0].primary_document_id in [document.id for document in documents], True) - self.assertEqual(works[1].primary_document_id in [document.id for document in documents], True) + def test_empty_pdf_directory(self): + """Test that the uploader handles an empty PDF directory gracefully.""" + self.mock_s3_client.upload_file.return_value = None + + empty_pdf_dir = f"{self.base_dir}/empty-pdf-test" + successful_uploads, failed_uploads = self.uploader.run_uploader( + pdf_directory_path=empty_pdf_dir, + metadata_json_file_path=self.metadata_json_file_path, + comment="Test upload comment", + ) + + self.assertEqual(len(successful_uploads), 0) + self.assertEqual(len(failed_uploads), 0) + + def test_invalid_metadata_json(self): + """Test that the uploader handles invalid metadata JSON files.""" + self.mock_s3_client.upload_file.return_value = None - self.assertEqual(works[0].provenance_id, provenance.id) - self.assertEqual(works[1].provenance_id, provenance.id) + invalid_metadata_json_file_path = f"{self.base_dir}/pdf-test/invalid_metadata.json" + with self.assertRaises(FileNotFoundError): + self.uploader.run_uploader( + pdf_directory_path=self.pdf_paths, + metadata_json_file_path=invalid_metadata_json_file_path, + comment="Test upload comment", + ) + + def test_partial_upload_failures(self): + """Test that the uploader correctly reports partial upload failures.""" + def mock_upload_file(Bucket, Key, Filename): + if "test1.pdf" in Filename: + raise Exception("Upload failed for test1.pdf") + return None + + self.mock_s3_client.upload_file.side_effect = mock_upload_file + + successful_uploads, failed_uploads = self.uploader.run_uploader( + pdf_directory_path=self.pdf_paths, + metadata_json_file_path=self.metadata_json_file_path, + comment="Test upload comment", + ) + self.assertEqual(len(successful_uploads), 1) + self.assertEqual(len(failed_uploads), 1) if __name__ == "__main__": - unittest.main() + unittest.main() \ No newline at end of file