Merge pull request #19 from Knox-AAU/jc/code-cleanup

Jc/code cleanup
Knox-AAU · Nov 27, 2023 · 3d08cbe · 3d08cbe
2 parents 64c451d + 1b7d750
commit 3d08cbe
Show file tree

Hide file tree

Showing 13 changed files with 29 additions and 42 deletions.
diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml
@@ -26,6 +26,7 @@ jobs:
         uses: actions/checkout@v3
         with: 
           fetch-depth: 0
+      - run: sudo apt-get update
       - run: sudo apt-get install poppler-utils
       - run: pip install -r requirements.txt
       - run: "python3 -m unittest discover -s src -p 'test_*.py'"
diff --git a/readme.md b/readme.md
@@ -3,7 +3,7 @@
 python -m venv .venv && source .venv/bin/activate && pip install -r requirements.txt
 ```
 
-# If cool boy:
+# Helper functions (h file):
 To lint: 
 ```bash
 python h lint
@@ -29,19 +29,19 @@ python -m unittest discover -s src -p 'test_*.py'
 
 # Command to setup setuptools and fix imports etc
 ```bash
-python3 -m pip install --editable .
+python -m pip install --editable .
 ```
 
 # Docker compose commands
-
+*Sudo rights may be needed. If so, prefix the command with sudo:* **sudo {command you want to run}**
 ### Build containers
 * **To build developer environment**
 ```bash
 docker compose -f docker-compose-dev.yml build
 ```
-* **To build production environment**
+* **To pull production environment**
 ```bash
-docker compose -f docker-compose-prod.yml build
+docker compose -f docker-compose-prod.yml pull
 ```
 
 ### Start containers

diff --git a/src/file_loading/file_loader.py b/src/file_loading/file_loader.py
@@ -78,13 +78,10 @@ def removefile(self):
                 os.remove(self.path + self.extension)
 
     def handle_files(self, read_file):
-        """ remake me """
+        """ File handler for copying files to the text_extraction folder """
         output_folder = "/watched/text_extraction/"
         output_file_path = output_folder + "out_" + str(read_file).rsplit('/', maxsplit=1)[-1]
 
         shutil.copy(read_file, output_file_path)
 
-        # print(f"shutil copied: {read_file}")
-        # os.remove(os.path.join(path, name))
-
         print(f"Fileloader moved: {read_file} to {output_file_path}")
diff --git a/src/file_loading/main.py b/src/file_loading/main.py
@@ -1,6 +1,6 @@
 """Entrance file for the file loader step of step 1 of the Knox pipeline"""
-from file_loading.file_loader import FileLoader
 from folder_watcher.folder_watcher import FolderWatcher
+from file_loading.file_loader import FileLoader
 
 if __name__ == '__main__':
     file_loader = FileLoader()

diff --git a/src/file_loading/test/__init__.py b/src/file_loading/test/__init__.py
@@ -0,0 +1 @@
+# pylint: skip-file
diff --git a/src/file_loading/test/test_file_loader.py b/src/file_loading/test/test_file_loader.py
@@ -6,24 +6,28 @@ class TestCase(unittest.TestCase):
     """Testing the FileLoader class"""
     def test_reads_correct_path(self):
         """ Test if the readextension function reads the correct path """
+
         # Arrange
         file_loader = FileLoader()
         file_loader.readextension('src/file_loading/test/test.pdf')
 
         # Act
         actual_path = file_loader.path
+
         # Assert
         self.assertEqual(actual_path, 'src/file_loading/test/test')
 
 
     def test_reads_correct_extension(self):
         """ Test if the readextension function reads the correct file extension """
+
         # Arrange
         file_loader = FileLoader()
         file_loader.readextension('src/file_loading/test/test.pdf')
 
         # Act
         actual_extension = file_loader.extension
+
         # Assert
         self.assertEqual(actual_extension, '.pdf')
 
@@ -33,6 +37,7 @@ def test_reads_correct_extension(self):
     # TODO: Right now it just reads from printed output :))
     def test_opens_pdf(self):
         """ Test if the openpdf function correctly opens and reads data """
+
         # Arrange
         file_loader = FileLoader()
         file_loader.readextension('src/file_loading/test/test.pdf')
@@ -46,6 +51,7 @@ def test_opens_pdf(self):
             file_loader.images[0].mode,
             file_loader.images[0].file_name,
         ]
+
         # Assert
         self.assertListEqual(actual, [
             'PPM',
@@ -57,6 +63,7 @@ def test_opens_pdf(self):
 
     def test_opens_image(self):
         """ Test if the openimage function correctly opens and reads data """
+
         # Arrange
         file_loader = FileLoader()
         file_loader.readextension('src/file_loading/test/test.jpg')
@@ -69,6 +76,7 @@ def test_opens_image(self):
             file_loader.images[0].mode,
             file_loader.images[0].file_name,
         ]
+
         # Assert
         self.assertListEqual(actual, [
             'JPEG',

diff --git a/src/folder_watcher/folder_watcher.py b/src/folder_watcher/folder_watcher.py
@@ -10,13 +10,11 @@ def __init__(self, function_to_run):
         self.function_to_run = function_to_run
 
     def on_created(self, event):
-        # print(event)
         if not event.is_directory:
             full_path = os.path.join(os.getcwd(), event.src_path)
             self.function_to_run(full_path)
 
     def on_modified(self, event):
-        # print(event)
         if not event.is_directory:
             full_path = os.path.join(os.getcwd(), event.src_path)
             self.function_to_run(full_path)
@@ -31,22 +29,21 @@ class FolderWatcher:
     :param: function_to_run is the function that should be applied to the file that is found.
     this function should start by reading the file
     """
+
     def __init__(self, path_to_watch, function_to_run):
         self.path_to_watch = path_to_watch
         self.function_to_run = function_to_run
 
     def watch(self):
-        """ This method is a wrapper function that watches the folder and 
+        """This method is a wrapper function that watches the folder and 
         applies a function to files in the folder"""
+
         watcher = _Watcher(self.function_to_run)
 
         observer = Observer()
         observer.schedule(watcher, path=self.path_to_watch, recursive=True)
         observer.start()
 
-        # Remove; only for test
-        print(f"Watching: {self.path_to_watch}\n")
-
         try:
             while True:
                 sleep(1)

diff --git a/src/spell_checking/main.py b/src/spell_checking/main.py
@@ -1,6 +1,6 @@
 """ Entrance file for the spell checking step of step 1 of the Knox pipeline """
-from spell_checking.spell_checker import SpellChecker
 from folder_watcher.folder_watcher import FolderWatcher
+from spell_checking.spell_checker import SpellChecker
 
 if __name__ == '__main__':
     spellchecker = SpellChecker("src/spell_checking/wordList.txt")

diff --git a/src/spell_checking/spell_checker.py b/src/spell_checking/spell_checker.py
@@ -99,31 +99,12 @@ def query(self, x):
         # Sort the results in reverse order and return
         return sorted(self.output, key=lambda x: x[1], reverse=True)
 
-    def handle_files_print(self, read_file):
-        """ Test """
-
-        if self.ready is True:
-            validwords = 0
-            invalidwords = 0
-
-            with open(read_file, 'r', encoding="utf-8") as reading_file:
-                for line in reading_file.readlines():
-                    for word in line.split(" "):
-                        if len(self.query(word)) > 0:
-                            validwords += 1
-                        else:
-                            invalidwords += 1
-
-            print(f"Valid words: {validwords}\nInvalid words: {invalidwords}")
-
     def handle_files(self, read_file):
-        """ Test """
+        """ Internal filehandling for spell_checker """
         if self.ready is True:
             output_folder = "/watched/output/"
             output_file_path = output_folder + str(read_file).rsplit('/', maxsplit=1)[-1]
 
-            print(output_file_path)
-
             with open(read_file, 'r', encoding="utf-8") as reading_file:
                 with open(output_file_path, 'w', encoding="utf-8") as output_file:
                     for line in reading_file.readlines():

diff --git a/src/spell_checking/test/test_spell_checker.py b/src/spell_checking/test/test_spell_checker.py
@@ -6,19 +6,21 @@ class TestCase(unittest.TestCase):
     """Testing the SpellChecker class"""
     def test_trie_initializes(self):
         """ Test if the Trie is correctly initialized """
+
         # Arrange
         sc = SpellChecker()
 
-        # Act
         # Assert
         actual = sc.root.char == ""
         self.assertEqual(actual, True)
 
 
     def test_trie_inserts_correctly(self):
         """ Test if the insert function works """
+
         # Arrange
         sc = SpellChecker()
+
         # Act
         sc.insert('test')
         actual = sc.query('t')
@@ -29,6 +31,7 @@ def test_trie_inserts_correctly(self):
 
     def test_trie_query_correctly(self):
         """ Test if the query function works """
+
         # Arrange
         sc = SpellChecker()
         sc.insert('cat')

diff --git a/src/text_extraction/main.py b/src/text_extraction/main.py
@@ -1,6 +1,6 @@
 """ Entrance file for the text extraction step of step 1 of the Knox pipeline """
-from text_extraction.text_extractor import TextExtractor
 from folder_watcher.folder_watcher import FolderWatcher
+from text_extraction.text_extractor import TextExtractor
 
 if __name__ == '__main__':
     text_extractor = TextExtractor()

diff --git a/src/text_extraction/test/test_check_text.py b/src/text_extraction/test/test_check_text.py
@@ -11,8 +11,9 @@ class TextExtractionTests(unittest.TestCase):
     """unit testing class"""
 
     def test_compare_text(self):
-        #Arrange
         """perform ocr on the test image and extracts the words into an extracted.txt file"""
+
+        #Arrange
         #generate a txt file called extracted.txt from test.png here
         extracted_text = get_word_stream('extracted.txt')
         expected_text = get_word_stream('expected.txt')

diff --git a/src/text_extraction/text_extractor.py b/src/text_extraction/text_extractor.py
@@ -6,15 +6,13 @@
 import pytesseract
 from pdf2image import convert_from_path
 
-# PDF_file = Path(r"./testdata/test5.pdf")
 @dataclasses.dataclass
 class TextExtractor():
     """ Text extraction interface """
     def __init__(self):
         self.out_dir = "/watched/spell_checking/"
         self.dpi = 500
         self.image_file_list = []
-        # self.queue = queue
 
     def read(self, input_file):
         """ Inner function that converts and reads PDFs """