Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Cb/integration file loader extraction #23

Closed
wants to merge 10 commits into from
13 changes: 13 additions & 0 deletions FileLoader_UnitTest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import os
import unittest

TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "readme.md")


class MyTest(unittest.TestCase):
    """Fixture-only test case that loads the test data file for each test."""

    def setUp(self):
        """Open the test data file and cache its contents in ``testdata``."""
        # Keep the file handle itself (not the string returned by .read()),
        # otherwise the .read() below and the .close() in tearDown would be
        # called on a str and raise AttributeError.
        self.testfile = open(TESTDATA_FILENAME, encoding="utf-8")
        self.testdata = self.testfile.read()

    def tearDown(self):
        """Close the handle opened in setUp."""
        self.testfile.close()
38 changes: 37 additions & 1 deletion src/file_loading/file_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,56 @@
from pdf2image.exceptions import PDFPopplerTimeoutError, PDFSyntaxError
from file_loading.extension_checker import ExtChecker


@dataclasses.dataclass
class ImageStructure:
    """Structure of the images we want to give to the next step.

    The attributes are declared as real dataclass fields (all defaulting to
    ``None``) so that the generated ``__repr__`` and ``__eq__`` take them into
    account. With a hand-written ``__init__`` and no declared fields, every
    instance compared equal and printed as ``ImageStructure()``.

    ``ImageStructure()`` with no arguments still works; keyword arguments are
    accepted as a convenience.
    """

    # more can be added if we want to save more information about the image loaded

    # The loaded image object itself.
    image: "object | None" = None
    # Base name of the file the image was loaded from.
    file_name: "str | None" = None
    # Position of the image within its source file.
    index: "int | None" = None
    # The three attributes below are assigned by FileLoader.openpdf from the
    # loaded image; declaring them here keeps the structure self-describing.
    format: "str | None" = None
    size: "tuple | None" = None
    mode: "str | None" = None


@dataclasses.dataclass
class FileLoader:
""" handling of files """
"""handling of files"""

def __init__(self):
self.extension = None
self.path = None
self.output_folder = ""
self.last_load_status = False
self.finished_loading = False

def readextension(self, string):
"""Reads the extension of the file and selects method to be used"""
self.path, self.extension = os.path.splitext(string)


def openpdf(self):
    """Convert the PDF at ``self.path + self.extension`` into page images.

    Every page returned by ``pdf2image.convert_from_path`` is wrapped in an
    ``ImageStructure`` and appended to ``self.images``.

    ``self.last_load_status`` is set to ``True`` when the conversion
    succeeded and to ``False`` when pdf2image raised one of the handled
    errors; in the failure case no images are added.
    """
    source = self.path + self.extension
    try:
        pages = pdf2image.convert_from_path(source)
    except (NotImplementedError, PDFPopplerTimeoutError, PDFSyntaxError):
        # Nothing was converted, so stop here. Falling through to the loop
        # would hit an unbound local and mask the real error.
        self.last_load_status = False
        return
    self.last_load_status = True

    # Tolerate instances on which the list has not been created yet.
    if getattr(self, "images", None) is None:
        self.images = []

    file_name = os.path.basename(source)
    for index, page in enumerate(pages):
        ims = ImageStructure()
        ims.file_name = file_name
        # Record the page position, as the other loading loop does.
        ims.index = index
        ims.image = page
        ims.format = page.format
        ims.size = page.size
        ims.mode = page.mode
        self.images.append(ims)

def openimage(self):
"""loads images"""
try:
with Image.open(self.path + self.extension) as image:
self.last_load_status = True
images = pdf2image.convert_from_path(self.path)
for index, image in enumerate(images):
ims = ImageStructure()
Expand All @@ -36,6 +65,7 @@ def openpdf(self):
self._output_file(ims)

self.last_load_status = True

except (NotImplementedError, PDFPopplerTimeoutError, PDFSyntaxError):
self.last_load_status = False

Expand All @@ -52,6 +82,12 @@ def _remove_file(self):
os.remove(self.path)

def handle_files(self, read_file):

"""remake me"""
output_folder = "/watched/text_extraction/"
output_file_path = (
output_folder + "out_" + str(read_file).rsplit("/", maxsplit=1)[-1]
)
""" handles calling file_loader functions when told by folder_watcher """

extension_checker = ExtChecker()
Expand Down
Binary file added src/file_loading/test/PDF_test1.pdf
Binary file not shown.
33 changes: 33 additions & 0 deletions src/file_loading/test/test_file_loader_text_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import unittest
from file_loading.file_loader import FileLoader
from text_extraction.text_extractor import TextExtractor


class Test_File_Loader_Text_Extraction_copy(unittest.TestCase):
    """Integration test running FileLoader and TextExtractor on one PDF."""

    def setUp(self):
        """Point the test at the PDF fixture."""
        # Relative path: the suite has to be started from the directory
        # that contains ``src/``.
        self.pdf_file_path = "src/file_loading/test/PDF_test1.pdf"

    def test_file_loader_and_text_extractor_integration(self):
        """Load the fixture, extract its text and check both results."""
        # Arrange
        file_loader = FileLoader()
        text_extractor = TextExtractor()

        # Act
        file_loader.readextension(self.pdf_file_path)
        file_loader.openpdf()
        text_extractor.read(self.pdf_file_path)

        # Assert
        # Verify that loaded image file is created by FileLoader.
        # The status and length checks come before any indexing so that a
        # failed load is reported as an assertion failure, not an IndexError.
        self.assertTrue(file_loader.last_load_status)
        self.assertEqual(len(file_loader.images), 1)
        first_image = file_loader.images[0]
        self.assertEqual(first_image.file_name, "PDF_test1.pdf")
        self.assertEqual(first_image.format, "PPM")

        # Verify that the text is extracted by TextExtractor
        output_file_path = text_extractor.out_dir + "PDF_test1.pdf"
        with open(output_file_path, "r", encoding="utf-8") as output_file:
            content = output_file.read().strip("\n")
        self.assertIn("word", content)
61 changes: 59 additions & 2 deletions src/text_extraction/text_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,85 @@
from PIL import Image
import pytesseract


# PDF_file = Path(r"./testdata/test5.pdf")

@dataclasses.dataclass
class TextExtractor():
""" Text extraction interface """
class TextExtractor:
"""Text extraction interface"""

def __init__(self):
self.out_dir = ""
self.dpi = 500

def read(self, input_file):
"""Inner function that converts and reads PDFs"""
# NOTE(review): this span comes from a diff view, so removed and added lines
# of read() appear interleaved and without indentation; confirm the real
# statement order against the file before relying on it.
# Part #1 : Converting PDF to images

# Output path: out_dir used as a plain string prefix, followed by the last
# '/'-separated component of input_file.
out_path = f"{self.out_dir}{input_file.split('/')[-1]}"

with open(out_path, "w", encoding="utf-8") as outfile_handle:
with TemporaryDirectory() as tempdir:
print("Converting path")
# Converts the input file to a list of pages
pdf_pages = convert_from_path(input_file, self.dpi)
print("Converted path to file")

# Loop over all the pages found above -> enumerate counts the pages for us
for page_enumeration, page in enumerate(pdf_pages, start=1):
# Create a file name to store the image
filename = f"{tempdir}/page_{page_enumeration:03}.jpg"
print("Reading file", filename)

# Declaring filename for each page of PDF as JPG
# PDF page 1 -> page_001.jpg
# PDF page 2 -> page_002.jpg

# Save the image of the page in system
page.save(filename, "JPEG")
# NOTE(review): confirm image_file_list is initialised before read() is called.
self.image_file_list.append(filename)
print("Creating file " + filename)

""" Inner function that reads images and outputs the OCR text"""


# Output path variant: same out_dir prefix, but the trailing run of non-dot
# characters in the input's base name (its extension) is replaced by "txt".
out_path = f"{self.out_dir}{re.sub(r'[^.]+$', 'txt', os.path.basename(input_file))}"

# Iterate over every page image recorded in image_file_list
for image_file in self.image_file_list:
# Set filename to recognize text from
# Again, these files will be:
# page_001.jpg
# page_002.jpg

# Recognize the text as string in image using pytesseract
print("Reading file" + image_file)
text = str(
((pytesseract.image_to_string(Image.open(image_file))))
)

# OCR the input file directly with lang="dan" (presumably the Danish
# Tesseract language data - confirm).
text = str(((pytesseract.image_to_string(Image.open(input_file),lang="dan"))))

print("Reading file" + input_file)

# Remove hyphen + newline pairs so that words hyphenated across a line break
# are joined back together.
text = text.replace("-\n", "")


# Write the complete OCR text to the output file
with open(out_path, 'w', encoding='utf-8') as file:
print(text)
file.write(text)


# Write the OCR text to the output file one word per line
for word in text.split(" "):
word = word.lower()
word = word.strip()
print(word)
# Here the text is split so that each word is written on its own
# line instead of writing the whole text at once
outfile_handle.write(word + "\n")

# Delete the input file if it still exists
if os.path.exists(input_file):
os.remove(input_file)

Loading