Knox-AAU · Stormlighter · Oct 18, 2023 · Nov 17, 2023 · Nov 17, 2023 · Nov 17, 2023
diff --git a/FileLoader_UnitTest.py b/FileLoader_UnitTest.py
@@ -0,0 +1,19 @@
+import os
+import unittest
+
+TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "readme.md")
+
+
+class MyTest(unittest.TestCase):
+    """Test scaffold that loads ``readme.md`` (next to this file) for each test."""
+
+    def setUp(self):
+        """Open the test-data file and read its full text."""
+        # Keep the file object itself (not the str returned by .read()) so
+        # that the next line can read from it and tearDown can close it.
+        self.testfile = open(TESTDATA_FILENAME, encoding="utf-8")
+        self.testdata = self.testfile.read()
+
+    def tearDown(self):
+        """Close the handle opened by setUp."""
+        self.testfile.close()
diff --git a/src/file_loading/file_loader.py b/src/file_loading/file_loader.py
@@ -6,27 +6,56 @@
 from pdf2image.exceptions import PDFPopplerTimeoutError, PDFSyntaxError
 from file_loading.extension_checker import ExtChecker
 
+
 @dataclasses.dataclass
 class ImageStructure:
     """Structure of the images we want to give to the next step"""
+
     def __init__(self) -> None:
+        # more can be added if we want to save more information about the image loaded
         self.image = None
         self.file_name = None
         self.index = None
+        self.format = self.size = self.mode = None  # filled in by FileLoader.openpdf
 
+
 @dataclasses.dataclass
 class FileLoader:
-    """ handling of files """
+    """handling of files"""
 
     def __init__(self):
         self.extension = None
         self.path = None
         self.output_folder = ""
         self.last_load_status = False
+        self.finished_loading = False
+        self.images = []  # ImageStructure objects collected by openpdf
+
+    def readextension(self, string):
+        """Split ``string`` into ``self.path`` (root) and ``self.extension``."""
+        self.path, self.extension = os.path.splitext(string)
 
     def openpdf(self):
         """loads pdf"""
         try:
+            images = pdf2image.convert_from_path(self.path + self.extension)
+        except (NotImplementedError, PDFPopplerTimeoutError, PDFSyntaxError):
+            self.last_load_status = False
+            return  # conversion failed, so there are no pages to collect
+        self.last_load_status = True
+        # Wrap each converted page with its source file name and image metadata.
+        for image in images:
+            ims = ImageStructure()
+            ims.file_name = os.path.basename(self.path + self.extension)
+            ims.image = image
+            ims.format, ims.size, ims.mode = image.format, image.size, image.mode
+            self.images.append(ims)
+
+    def openimage(self):
+        """loads images"""
+        try:
+            with Image.open(self.path + self.extension) as image:
+                self.last_load_status = True
             images = pdf2image.convert_from_path(self.path)
             for index, image in enumerate(images):
                 ims = ImageStructure()
@@ -36,6 +65,7 @@ def openpdf(self):
                 self._output_file(ims)
 
             self.last_load_status = True
+
         except (NotImplementedError, PDFPopplerTimeoutError, PDFSyntaxError):
             self.last_load_status = False
 
@@ -52,6 +82,12 @@ def _remove_file(self):
                 os.remove(self.path)
 
     def handle_files(self, read_file):
+
+        """remake me"""
+        output_folder = "/watched/text_extraction/"
+        output_file_path = (
+            output_folder + "out_" + str(read_file).rsplit("/", maxsplit=1)[-1]
+        )
         """ handles calling file_loader functions when told by folder_watcher """
 
         extension_checker = ExtChecker()

diff --git a/src/file_loading/test/PDF_test1.pdf b/src/file_loading/test/PDF_test1.pdf
diff --git a/src/file_loading/test/test_file_loader_text_extraction.py b/src/file_loading/test/test_file_loader_text_extraction.py
@@ -0,0 +1,37 @@
+import unittest
+from file_loading.file_loader import FileLoader
+from text_extraction.text_extractor import TextExtractor
+
+
+class Test_File_Loader_Text_Extraction_copy(unittest.TestCase):
+    """Integration test: FileLoader and TextExtractor on the same sample PDF."""
+
+    def setUp(self):
+        """Store the relative path of the sample PDF used by the test."""
+        self.pdf_file_path = "src/file_loading/test/PDF_test1.pdf"
+
+    def test_file_loader_and_text_extractor_integration(self):
+        """Load the PDF, run text extraction on it, and check both results."""
+        # Arrange
+        file_loader = FileLoader()
+        text_extractor = TextExtractor()
+
+        # Act
+        file_loader.readextension(self.pdf_file_path)
+        file_loader.openpdf()
+        text_extractor.read(self.pdf_file_path)
+
+        # Assert
+        # Check the load status first so that a failed load is reported as an
+        # assertion failure instead of an error when indexing ``images``.
+        self.assertTrue(file_loader.last_load_status)
+        self.assertEqual(len(file_loader.images), 1)
+        first_image = file_loader.images[0]
+        self.assertEqual(first_image.file_name, "PDF_test1.pdf")
+        self.assertEqual(first_image.format, "PPM")
+
+        # Verify that the text is extracted by TextExtractor
+        output_file_path = text_extractor.out_dir + "PDF_test1.pdf"
+        with open(output_file_path, "r", encoding="utf-8") as output_file:
+            content = output_file.read().strip("\n")
+        self.assertIn("word", content)
diff --git a/src/text_extraction/text_extractor.py b/src/text_extraction/text_extractor.py
@@ -5,28 +5,85 @@
 from PIL import Image
 import pytesseract
 
+
+# PDF_file = Path(r"./testdata/test5.pdf")
+
 @dataclasses.dataclass
-class TextExtractor():
-    """ Text extraction interface """
+class TextExtractor:
+    """Text extraction interface"""
+
     def __init__(self):
         self.out_dir = ""
         self.dpi = 500
 
     def read(self, input_file):
+        """Convert the PDF at input_file to page images and write the OCR text out"""
+        # Part #1 : Converting PDF to images
+
+        out_path = f"{self.out_dir}{input_file.split('/')[-1]}"
+
+        with open(out_path, "w", encoding="utf-8") as outfile_handle:
+            with TemporaryDirectory() as tempdir:
+                print("Converting path")
+                # Converts the input file to a list of pages
+                pdf_pages = convert_from_path(input_file, self.dpi)
+                print("Converted path to file")
+
+                # Loop over all the pages found above -> enumerate counts the pages for us
+                for page_enumeration, page in enumerate(pdf_pages, start=1):
+                    # Create a file name to store the image
+                    filename = f"{tempdir}/page_{page_enumeration:03}.jpg"
+                    print("Reading file", filename)
+
+                    # Declaring filename for each page of PDF as JPG
+                    # PDF page 1 -> page_001.jpg
+                    # PDF page 2 -> page_002.jpg
+                    # Save the image of the page in system
+                    page.save(filename, "JPEG")
+                    # NOTE(review): self.image_file_list is not created in __init__ - confirm.
+                    self.image_file_list.append(filename)
+                    print("Creating file " + filename)
         """ Inner function that reads images and outputs the OCR text"""
 
+        # NOTE(review): unresolved merge - the statements at this indentation are the
+        # previous implementation and end the with-blocks above, so the deeper-indented
+        # loops that follow cannot parse. Keep one implementation and drop the other.
         out_path = f"{self.out_dir}{re.sub(r'[^.]+$', 'txt', os.path.basename(input_file))}"
 
+                    # Iterate over every page image recorded in self.image_file_list
+                    for image_file in self.image_file_list:
+                        # Set filename to recognize text from
+                        # Again, these files will be:
+                        # page_001.jpg
+                        # page_002.jpg
+
+                        # Recognize the text as string in image using pytesseract
+                        print("Reading file" + image_file)
+                        text = str(
+                            ((pytesseract.image_to_string(Image.open(image_file))))
+                        )
+
         text = str(((pytesseract.image_to_string(Image.open(input_file),lang="dan"))))
 
         print("Reading file" + input_file)
 
         text = text.replace("-\n", "")
 
         # Save each sentence as a new line in the output file
         with open(out_path, 'w', encoding='utf-8') as file:
             print(text)
             file.write(text)
 
+
+                        # Write the recognised text to outfile_handle for the further steps
+                        for word in text.split(" "):
+                            word = word.lower()
+                            word = word.strip()
+                            print(word)
+                            # The text is split on spaces so that each word is written
+                            # on its own line instead of the whole text at once
+                            outfile_handle.write(word + "\n")
+
         if os.path.exists(input_file):
             os.remove(input_file)
+