Wazzabeee · Wazzabeee · May 4, 2024 · May 4, 2024
diff --git a/README.md b/README.md
@@ -19,6 +19,14 @@ $ copy-spotter [-s] [-o] [-h] input_directory
 ***Positional Arguments:***
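-* `input_directory`: Directory that contains one folder per pdf file (see `data/pdf/plagiarism` for example)
+* `input_directory`: Directory that directly contains the files to compare (`txt`, `pdf`, `docx` or `odt`), laid out as follows: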
 
+```
+input_directory/
+│
+├── file_1.docx
+├── file_2.pdf
+└── file_3.pdf
+```
+
 ***Optional Arguments:***
 * `-s`, `--block-size`: Set minimum number of consecutive and similar words detected. (Default is 2)
 * `-o`, `--out_dir`: Set the output directory for html files. (Default is creating a new directory called results)
@@ -72,8 +80,4 @@ $ python -m scripts.main [-s] [-o] [-h] input_directory
 ---
 - Add more tests on existing functions
 - Implement OCR with tesseract for scanned documents
-- Add info in console for timing (tqdm)
-- Add CSS to HTML Template to make the results better looking
-- Add support for other folder structures (right now the package is expecting one pdf files per folder)
-- Add custom naming option for pdf files
-- Fix Slate3k by installing custom fork (check if still relevant)
+- Add custom naming option for pdf files
diff --git a/data/pdf/plagiarism/Axel Mare_report/report.txt b/data/pdf/plagiarism/Axel Mare_report/report.txt
diff --git a/data/pdf/plagiarism/John Doe_report/report_2.txt b/data/pdf/plagiarism/John Doe_report/report_2.txt
diff --git a/data/pdf/plagiarism/Lucas Pelipe_report/random_txt.txt b/data/pdf/plagiarism/Lucas Pelipe_report/random_txt.txt
diff --git a/data/pdf/plagiarism/Marie Pole_report/final_version.txt b/data/pdf/plagiarism/Marie Pole_report/final_version.txt
diff --git a/scripts/main.py b/scripts/main.py
@@ -18,7 +18,7 @@
 from scripts.html_utils import writing_results
 from scripts.processing_files import file_extension_call
 from scripts.similarity import difflib_overlap
-from scripts.utils import wait_for_file, get_student_names, parse_options
+from scripts.utils import wait_for_file, parse_options
 
 
 class MinimumFilesError(Exception):
@@ -62,7 +62,7 @@ def main() -> None:
         in_dir = path.abspath(in_dir)
 
     files = [
-        f for f in listdir(in_dir) if path.isdir(path.join(in_dir, f)) or f.endswith(("txt", "pdf", "docx", "odt"))
+        f for f in listdir(in_dir) if path.isfile(path.join(in_dir, f)) and f.endswith(("txt", "pdf", "docx", "odt"))
     ]
 
     if len(files) < 2:
@@ -71,19 +71,14 @@ def main() -> None:
         )
 
     filenames, processed_files = [], []
-    students_names = get_student_names(in_dir)
-
-    for ind, direc in enumerate(tqdm(listdir(in_dir), desc="Processing Directories")):
-        if path.isdir(path.join(in_dir, direc)):
-            for file in listdir(path.join(in_dir, direc)):
-                file_words = file_extension_call(str(path.join(in_dir, direc, file)))
-                if file_words:  # If all files have supported format
-                    processed_files.append(file_words)
-                    filenames.append(students_names[ind])
-                else:
-                    raise UnsupportedFileError(
-                        "Remove files which are not txt, pdf, docx, or odt and run the script again."
-                    )
+
+    for file in tqdm(files, desc="Processing Files"):
+        file_words = file_extension_call(str(path.join(in_dir, file)))
+        if file_words:  # If all files have supported format
+            processed_files.append(file_words)
+            filenames.append(path.splitext(file)[0])
+        else:
+            raise UnsupportedFileError("Remove files which are not txt, pdf, docx, or odt and run the script again.")
 
     if out_dir is not None and path.exists(out_dir):
         if not path.isabs(out_dir):

diff --git a/scripts/processing_files.py b/scripts/processing_files.py
@@ -13,33 +13,29 @@
 def get_file_extension(filepath: str) -> str:
     """Return the file extension of the file at the specified path"""
     if not path.isfile(filepath):
-        print("Invalid file path")
-        return ""
+        raise ValueError(f"Invalid file path: {filepath}")
 
     try:
         return path.splitext(filepath)[1]
     except IndexError:
-        print("File extension error")
-        return ""
+        raise ValueError(f"File extension error for file: {filepath}")
 
 
 def file_extension_call(file: str) -> list:
     """Map file extension to appropriate function"""
 
     extension = get_file_extension(file)
 
-    if extension:
-        if extension == ".pdf":
-            return get_words_from_pdf_file(file)
-        if extension == ".docx":
-            return get_words_from_docx_file(file)
-        if extension == ".odt":
-            return get_words_from_odt_file(file)
-        if extension == ".txt":
-            return get_words_from_txt_file(file)
-
-    print("File format is not supported. Please convert to pdf, docx, odt or txt")
-    return []
+    if extension == ".pdf":
+        return get_words_from_pdf_file(file)
+    elif extension == ".docx":
+        return get_words_from_docx_file(file)
+    elif extension == ".odt":
+        return get_words_from_odt_file(file)
+    elif extension == ".txt":
+        return get_words_from_txt_file(file)
+    else:
+        raise ValueError(f"File format not supported for file: {file}. " f"Please convert to pdf, docx, odt, or txt")
 
 
 def get_words_from_pdf_file(pdf_path: str) -> list: