From 1f28585ba15c69fa34f87bbb6a1a584ad4523d41 Mon Sep 17 00:00:00 2001
From: Daniel-VM <danielvmillares@gmail.com>
Date: Tue, 9 Apr 2024 12:22:23 +0200
Subject: [PATCH] add reviewer suggestions #258

---
 relecov_tools/__main__.py                     |   1 -
 .../assets/pipeline_utils/viralrecon.py       |  75 +++++++--
 relecov_tools/conf/bioinfo_config.json        |   4 +-
 relecov_tools/read_bioinfo_metadata.py        | 159 ++++++++++++++----
 4 files changed, 192 insertions(+), 47 deletions(-)

diff --git a/relecov_tools/__main__.py b/relecov_tools/__main__.py
index 75ded811..91c6c3ca 100755
--- a/relecov_tools/__main__.py
+++ b/relecov_tools/__main__.py
@@ -17,7 +17,6 @@
 import relecov_tools.map_schema
 import relecov_tools.feed_database
 import relecov_tools.read_bioinfo_metadata
-import relecov_tools.long_table_parse
 import relecov_tools.metadata_homogeneizer
 import relecov_tools.gisaid_upload
 import relecov_tools.upload_ena_protocol
diff --git a/relecov_tools/assets/pipeline_utils/viralrecon.py b/relecov_tools/assets/pipeline_utils/viralrecon.py
index 07240c05..56552ea4 100644
--- a/relecov_tools/assets/pipeline_utils/viralrecon.py
+++ b/relecov_tools/assets/pipeline_utils/viralrecon.py
@@ -30,7 +30,6 @@ class LongTableParse:
     - saving_file(generated_JSON)
     - parsing_csv() : It manages all this proccess:
         - calling first to parse_a_list_of_dictionaries() and then calling to saving_file()
-
     """
 
     def __init__(self, file_path=None, output_directory=None):
@@ -136,17 +135,32 @@ def convert_to_json(self, samp_dict):
         j_list = []
         # Grab date from filename
         result_regex = re.search(
-            "variants_long_table_(.*).csv", os.path.basename(self.file_path)
+            r"variants_long_table(?:_\d{8})?\.csv", os.path.basename(self.file_path)
         )
+        # Check the match before using it: re.search returns None on a non-matching filename.
         if result_regex is None:
-            log.error("Analysis date not found in filename, aborting")
             stderr.print(
-                "[red]Error: filename must include analysis date in format YYYYMMDD"
+                "[red]\tWARN: Couldn't find variants long table file. Expected file name is:"
+            )
+            stderr.print(
+                "[red]\t\t- variants_long_table.csv or variants_long_table_YYYYMMDD.csv. Aborting..."
             )
-            stderr.print("[red]e.g. variants_long_table_20220830.csv")
             sys.exit(1)
+        else:
+            date_regex = re.search(r"(\d{8})", result_regex.group())
+            if date_regex is not None:
+                analysis_date = date_regex.group()
+                stderr.print(
+                    f"[green]\tDate {analysis_date} found in {self.file_path}"
+                )
+            else:
+                analysis_date = "Not Provided [GENEPIO:0001668]"
+                stderr.print(
+                    f"[yellow]\tWARN:No analysis date found in long table: {self.file_path}"
+                )
+
         for key, values in samp_dict.items():
-            j_dict = {"sample_name": key, "analysis_date": result_regex.group(1)}
+            j_dict = {"sample_name": key, "analysis_date": analysis_date}
             j_dict["variants"] = values
             j_list.append(j_dict)
         return j_list
@@ -182,12 +196,20 @@ def parsing_csv(self):
 
 # START util functions
 def handle_pangolin_data(files_list):
-    """File handler to parse pangolin data (csv) into JSON structured format."""
+    """File handler to parse pangolin data (csv) into JSON structured format.
+
+    Args:
+        files_list (list): A list with paths to pangolin files.
+
+    Returns:
+        pango_data_processed: A dictionary containing pangolin data handled.
+    """
     method_name = f"{handle_pangolin_data.__name__}"
     method_log_report = BioinfoReportLog()
 
     # Handling pangolin data
     pango_data_processed = {}
+    valid_samples = []
     try:
         files_list_processed = relecov_tools.utils.select_most_recent_files_per_sample(
             files_list
@@ -208,23 +230,39 @@ def handle_pangolin_data(files_list):
                     key.split()[0]: value for key, value in pango_data.items()
                 }
                 pango_data_processed.update(pango_data_updated)
-                method_log_report.update_log_report(
-                    method_name, "valid", f"Successfully handled data in {pango_file}."
-                )
+                valid_samples.append(pango_data_key.split()[0])
             except (FileNotFoundError, IndexError) as e:
                 method_log_report.update_log_report(
-                    method_name, "error", f"Error processing file {pango_file}: {e}"
+                    method_name,
+                    "warning",
+                    f"Error occurred while processing file {pango_file}: {e}",
                 )
-                sys.exit(method_log_report.print_log_report(method_name, ["error"]))
+                continue
     except Exception as e:
         method_log_report.update_log_report(
-            method_name, "error", f"Error occurred while processing files: {e}"
+            method_name, "warning", f"Error occurred while processing files: {e}"
+        )
+    if len(valid_samples) > 0:
+        method_log_report.update_log_report(
+            method_name,
+            "valid",
+            f"Successfully handled data in samples: {', '.join(valid_samples)}",
         )
-        sys.exit(method_log_report.print_log_report(method_name, ["error"]))
+    method_log_report.print_log_report(method_name, ["valid", "warning"])
     return pango_data_processed
 
 
 def parse_long_table(files_list):
+    """File handler to retrieve data from long table files and convert it into a JSON structured format.
+    This function utilizes the LongTableParse class to parse the long table data.
+    Since this utility handles and maps data in a custom way, it returns None to avoid being transferred to method read_bioinfo_metadata.BioinfoMetadata.mapping_over_table().
+
+    Args:
+        files_list (list): A list of paths to long table files.
+
+    Returns:
+        None: Indicates that the function does not return any meaningful value.
+    """
     method_name = f"{parse_long_table.__name__}"
     method_log_report = BioinfoReportLog()
 
@@ -250,7 +288,14 @@ def parse_long_table(files_list):
 
 
 def handle_consensus_fasta(files_list):
-    """File handler to parse consensus fasta data (*.consensus.fa) into JSON structured format"""
+    """File handler to parse consensus data (fasta) into JSON structured format.
+
+    Args:
+        files_list (list): A list with paths to consensus files.
+
+    Returns:
+        consensus_data_processed: A dictionary containing consensus data handled.
+    """
     method_name = f"{handle_consensus_fasta.__name__}"
     method_log_report = BioinfoReportLog()
 
diff --git a/relecov_tools/conf/bioinfo_config.json b/relecov_tools/conf/bioinfo_config.json
index 80a052b4..8789f387 100644
--- a/relecov_tools/conf/bioinfo_config.json
+++ b/relecov_tools/conf/bioinfo_config.json
@@ -1,7 +1,7 @@
 {
     "viralrecon": {
         "mapping_stats": {
-            "fn": "mapping_illumina_stats.tab",
+            "fn": "mapping_illumina_stats(?:_\\d{8})?\\.tab",
             "sample_col_idx": 5,
             "header_row_idx": 1,
             "required": true,
@@ -37,7 +37,7 @@
             }
         },
         "variants_long_table": {
-            "fn": "variants_long_table(?:_\\d{8})?\\.csv$",
+            "fn": "variants_long_table(?:_\\d{8})?\\.csv",
             "sample_col_idx": 1,
             "header_row_idx": 1,
             "required": true,
diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py
index b49a37d6..cbfb7251 100755
--- a/relecov_tools/read_bioinfo_metadata.py
+++ b/relecov_tools/read_bioinfo_metadata.py
@@ -26,7 +26,19 @@ def __init__(self, log_report=None):
             self.report = log_report
 
     def update_log_report(self, method_name, status, message):
-        """Update progress log report"""
+        """Update the progress log report with the given method name, status, and message.
+
+        Args:
+            method_name (str): The name of the method being logged.
+            status (str): The status of the log message, can be one of 'valid', 'error', or 'warning'.
+            message (str): The message to be logged.
+
+        Returns:
+            dict: The updated progress log report.
+
+        Raises:
+            ValueError: If an invalid status is provided.
+        """
         if status == "valid":
             self.report["valid"].setdefault(method_name, []).append(message)
             return self.report
@@ -40,7 +52,15 @@ def update_log_report(self, method_name, status, message):
             raise ValueError("Invalid status provided.")
 
     def print_log_report(self, name, sections):
-        """Calls log report printer."""
+        """Prints the log report by calling util's function.
+
+        Args:
+            name (str): The name of the log report.
+            sections (list of str): The sections of the log report to be printed.
+
+        Returns:
+            None
+        """
         relecov_tools.utils.print_log_report(self.report, name, sections)
 
 
@@ -115,13 +135,24 @@ def __init__(
             )
 
     def get_available_software(self, json):
-        """Get list of available software in configuration"""
+        """Get list of available software in configuration
+
+        Args:
+            json (str): Path to bioinfo configuration json file.
+
+        Returns:
+            available_software: List containing available software defined in json.
+        """
         config = relecov_tools.utils.read_json_file(json)
         available_software = list(config.keys())
         return available_software
 
     def scann_directory(self):
-        """Scanns bioinfo analysis directory and identifies files according to the file name patterns defined in the software configuration json."""
+        """Scans bioinfo analysis directory and identifies files according to the file name patterns defined in the software configuration json.
+
+        Returns:
+            files_found: A dictionary containing file paths found based on the definitions provided in the bioinformatic JSON file within the software scope (self.software_config).
+        """
         method_name = f"{self.scann_directory.__name__}"
         total_files = sum(len(files) for _, _, files in os.walk(self.input_folder))
         files_found = {}
@@ -159,7 +190,11 @@ def scann_directory(self):
             return files_found
 
     def validate_software_mandatory_files(self, files_dict):
-        """ "Validates the presence of all mandatory files as defined in the software configuration JSON."""
+        """Validates the presence of all mandatory files as defined in the software configuration JSON.
+
+        Args:
+            files_dict (dict{str:str}): A dictionary containing file paths found based on the definitions provided in the bioinformatic JSON file within the software scope (self.software_config).
+        """
         method_name = f"{self.validate_software_mandatory_files.__name__}"
         missing_required = []
         for key in self.software_config:
@@ -182,14 +217,21 @@ def validate_software_mandatory_files(self, files_dict):
             self.log_report.update_log_report(
                 method_name, "valid", "Successfull validation of mandatory files."
             )
-        self.log_report.print_log_report(method_name, ["valid", "waring"])
+        self.log_report.print_log_report(method_name, ["valid", "warning"])
         return
 
     def add_bioinfo_results_metadata(self, files_dict, j_data):
-        """
-        Adds metadata from bioinformatics results to j_data.
-            1. Handles metadata in bioinformatic files found.
-            2. Mapping handled bioinfo metadata into j_data.
+        """Adds metadata from bioinformatics results to j_data.
+        It first calls file_handlers and then maps the handled
+        data into j_data.
+
+        Args:
+            files_dict (dict{str:str}): A dictionary containing file paths found based on the definitions provided in the bioinformatic JSON file within the software scope (self.software_config).
+
+            j_data (list(dict{str:str})): A list of dictionaries containing lab metadata (one list item per sample).
+
+        Returns:
+            j_data_mapped: A list of dictionaries with bioinformatics metadata mapped into j_data.
         """
         method_name = f"{self.add_bioinfo_results_metadata.__name__}"
         for key in self.software_config.keys():
@@ -225,13 +267,12 @@ def add_bioinfo_results_metadata(self, files_dict, j_data):
                     f"No metadata found to perform standard mapping when processing '{self.software_name}.{key}'",
                 )
                 continue
-        self.log_report.print_log_report(method_name, ["valid", "waring"])
+        self.log_report.print_log_report(method_name, ["valid", "warning"])
         return j_data_mapped
 
     def handling_files(self, file_list):
-        """
+        """Handles different file formats to extract data regardless of their structure. The goal is to extract the data contained in files specified in ${file_list}, using either 'standard' handlers defined in this class or pipeline-specific file handlers.
         (inspired from ./metadata_homogenizer.py)
-        Handles different file formats to extract data regardless of their structure. The goal is to extract the data contained in files specified in ${file_list}, using either 'standard' handlers defined in this class or pipeline-specific file handlers.
 
         A file handler method must generate a data structure as follow:
             {
@@ -249,11 +290,11 @@ def handling_files(self, file_list):
             }
         Note: ensure that 'field1','field2','field3' corresponds with the values especifies in the 'content' section of each software configuration scope (see: conf/bioinfo_config.json).
 
-        Input:
+        Args:
             file_list (list): A list of file path/s to be processed.
 
         Returns:
-            dict: A single dictionary containing extracted data for each sample.
+            data: A dictionary containing bioinfo metadata handled for each sample.
         """
         method_name = f"{self.add_bioinfo_results_metadata.__name__}:{self.handling_files.__name__}"
         file_name = self.software_config[self.current_config_key].get("fn")
@@ -309,20 +350,28 @@ def handling_files(self, file_list):
         return data
 
     def mapping_over_table(self, j_data, map_data, mapping_fields, table_name):
-        """
-        Function that maps structure data containing fields per sample into j_data.
+        """Maps bioinformatics metadata from map_data to j_data based on the mapping_fields.
+
+        Args:
+            j_data (list(dict{str:str})): A list of dictionaries containing lab metadata (one item per sample).
+            map_data (dict(dict{str:str})): A dictionary containing bioinfo metadata handled by the method handling_files().
+            mapping_fields (dict{str:str}): A dictionary of mapping fields defined in the 'content' definition under each software scope (see conf/bioinfo_config.json).
+            table_name (str): Path to the mapping file/table.
+
+        Returns:
+            j_data: updated j_data with bioinformatic metadata mapped in it.
         """
         method_name = f"{self.mapping_over_table.__name__}:{self.software_name}.{self.current_config_key}"
         errors = []
         field_errors = {}
-        field_vaild = {}
+        field_valid = {}
         for row in j_data:
-            sample_name = row["sequencing_sample_id"].replace("-", "_")
+            sample_name = row["sequencing_sample_id"]
             if sample_name in map_data.keys():
                 for field, value in mapping_fields.items():
                     try:
                         row[field] = map_data[sample_name][value]
-                        field_vaild[sample_name] = {field: value}
+                        field_valid[sample_name] = {field: value}
                     except KeyError as e:
                         field_errors[sample_name] = {field: e}
                         row[field] = "Not Provided [GENEPIO:0001668]"
@@ -357,13 +406,21 @@ def mapping_over_table(self, j_data, map_data, mapping_fields, table_name):
             self.log_report.update_log_report(
                 method_name,
                 "valid",
-                f"Successfully mapped fields in {', '.join(field_vaild.keys())} - {table_name}.",
+                f"Successfully mapped fields in {', '.join(field_valid.keys())} - {table_name}.",
             )
         self.log_report.print_log_report(method_name, ["valid", "warning"])
         return j_data
 
     def get_multiqc_software_versions(self, file_list, j_data):
-        """Reads multiqc html file, finds table containing software version info, and map it to j_data"""
+        """Reads multiqc html file, finds the table containing software version info, and maps it to j_data.
+
+        Args:
+            file_list (list): A list containing the path to file multiqc_report.html.
+            j_data (list(dict{str:str})): A list of dictionaries containing lab metadata (one item per sample).
+
+        Returns:
+            j_data: updated j_data with software version's info mapped in it.
+        """
         method_name = f"{self.get_multiqc_software_versions.__name__}"
         # Handle multiqc_report.html
         f_path = file_list[0]
@@ -437,7 +494,14 @@ def get_multiqc_software_versions(self, file_list, j_data):
         return j_data
 
     def add_fixed_values(self, j_data):
-        """include the fixed data defined in configuration or feed custom empty fields"""
+        """Add fixed values to j_data as defined in the bioinformatics configuration (definition: "fixed_values").
+
+        Args:
+            j_data (list(dict{str:str})): A list of dictionaries containing lab metadata (one item per sample).
+
+        Returns:
+            j_data: updated j_data with fixed values added in it.
+        """
         method_name = f"{self.add_fixed_values.__name__}"
         try:
             f_values = self.software_config["fixed_values"]
@@ -452,13 +516,37 @@ def add_fixed_values(self, j_data):
                 method_name, "warning", f"Error found while adding fixed values: {e}"
             )
             pass
-        self.log_report.print_log_report(method_name, ["valid", "waring"])
+        self.log_report.print_log_report(method_name, ["valid", "warning"])
         return j_data
 
     def add_bioinfo_files_path(self, files_found_dict, j_data):
-        """Adds file paths (essential for handlers and mapping methods to process bioinformatics metadata) to the j_data. In instances where multiple files are identified per configuration item (e.g., viralrecon.mapping_consensus → *.consensus.fa), each sample in j_data receives its respective file path. If no file path is located, the function appends "Not Provided [GENEPIO:0001668]" to indicate missing data.g file. If no file path is found, then adds "Not Provided [GENEPIO:0001668]"""
+        """Adds file paths essential for handling and mapping bioinformatics metadata to the j_data.
+        For each sample in j_data, the function assigns the corresponding file path based on the identified files in files_found_dict.
+        If multiple files are identified per configuration item (e.g., viralrecon.mapping_consensus → *.consensus.fa), each sample in j_data receives its respective file path.
+        If no file path is located, the function appends "Not Provided [GENEPIO:0001668]" to indicate missing data.
+
+        Args:
+            files_found_dict (dict): A dictionary containing file paths identified for each configuration item.
+            j_data (list(dict{str:str})): A list of dictionaries containing lab metadata (one item per sample).
+
+        Returns:
+            j_data: Updated j_data with file paths mapped for bioinformatic metadata.
+        """
+        method_name = f"{self.add_bioinfo_files_path.__name__}"
+        sample_error = 0
         for row in j_data:
-            sample_name = row["sequencing_sample_id"]
+            try:
+                sample_name = re.match(
+                    r"^(.*?)_R1\.fastq\.gz", row["sequence_file_R1_fastq"]
+                ).group(1)
+            except AttributeError as e:
+                sample_error += 1
+                self.log_report.update_log_report(
+                    method_name,
+                    "warning",
+                    f" {row['sequence_file_R1_fastq']} doesn't match pattern '*_R1.fastq.gz'. Cannot add file paths (error: {e})",
+                )
+                continue
             for key, value in files_found_dict.items():
                 file_path = "Not Provided [GENEPIO:0001668]"
                 if value:  # Check if value is not empty
@@ -471,11 +559,21 @@ def add_bioinfo_files_path(self, files_found_dict, j_data):
                         file_path = value[0]
                 path_key = f"{self.software_name}_filepath_{key}"
                 row[path_key] = file_path
+        self.log_report.print_log_report(method_name, ["warning"])
+        if sample_error == 0:
+            self.log_report.update_log_report(
+                method_name, "valid", "File paths added successfully."
+            )
+            self.log_report.print_log_report(method_name, ["valid"])
         return j_data
 
     def collect_info_from_lab_json(self):
-        """Create the list of dictionaries from the data that is on json lab
-        metadata file. Return j_data that is used to add the rest of the fields
+        """Reads lab metadata from a JSON file and creates a list of dictionaries.
+        Reads lab metadata from the specified JSON file and converts it into a list of dictionaries.
+        This list is used to add the rest of the fields.
+
+        Returns:
+            json_lab_data: A list of dictionaries containing lab metadata (aka j_data).
         """
         method_name = f"{self.collect_info_from_lab_json.__name__}"
         try:
@@ -494,7 +592,10 @@ def collect_info_from_lab_json(self):
     def create_bioinfo_file(self):
         """Create the bioinfodata json with collecting information from lab
         metadata json, mapping_stats, and more information from the files
-        inside input directory
+        inside input directory.
+
+        Returns:
+            bool: True if the bioinfo file creation process was successful.
         """
         # Find and validate bioinfo files
         stderr.print("[blue]Sanning input directory...")