From abd6c97c6284c8d235683492a7b8dc4343286364 Mon Sep 17 00:00:00 2001
From: Alejandro <aberdur7@gmail.com>
Date: Fri, 7 Feb 2025 11:10:49 +0100
Subject: [PATCH] Improve build-schema module: versioned outputs and VERSION sheet

---
 .../schema_utils/metadatalab_template.py      |  48 ++++-
 relecov_tools/build_schema.py                 | 184 ++++++++++++++++--
 2 files changed, 201 insertions(+), 31 deletions(-)

diff --git a/relecov_tools/assets/schema_utils/metadatalab_template.py b/relecov_tools/assets/schema_utils/metadatalab_template.py
index db4d4a1d..4f15cd26 100644
--- a/relecov_tools/assets/schema_utils/metadatalab_template.py
+++ b/relecov_tools/assets/schema_utils/metadatalab_template.py
@@ -67,7 +67,6 @@ def schema_properties_to_df(json_data):
         stderr.print(f"[red]Error in schema_properties_to_df: {e}")
         return None
 
-
 def excel_formater(df, writer, sheet, out_file, have_index=True, have_header=True):
     try:
 
@@ -80,24 +79,41 @@ def excel_formater(df, writer, sheet, out_file, have_index=True, have_header=Tru
         workbook = writer.book
         worksheet = writer.sheets[sheet]
 
-        # setup excel format
+        # Set up general column width
         worksheet.set_column(0, len(df.columns), 30)
+
+        # General header format
         header_formater = workbook.add_format(
             {
                 "bold": True,
                 "text_wrap": False,
                 "valign": "top",
-                "fg_color": "#ADD8E6",
+                "fg_color": "#B9DADE",  # Light blue
                 "border": 1,
                 "locked": True,
             }
         )
+
+        # Custom header format for METADATA_LAB (red text starting from column 2)
+        red_header_formater = workbook.add_format(
+            {
+                "bold": True,
+                "text_wrap": False,
+                "valign": "top",
+                "fg_color": "#B9DADE",  # Light blue background
+                "color": "#E05959",  # Red text color
+                "border": 1,
+                "locked": True,
+            }
+        )
+
+        # First column format
         first_col_formater = workbook.add_format(
             {
                 "bold": True,
                 "text_wrap": False,
                 "valign": "center",
-                "fg_color": "#ADD8E6",
+                "fg_color": "#B9DADE",  # Light blue
                 "border": 1,
                 "locked": True,
             }
@@ -107,7 +123,7 @@ def excel_formater(df, writer, sheet, out_file, have_index=True, have_header=Tru
             # Write the column headers with the defined format.
             for col_num, value in enumerate(df.columns.values):
                 try:
-                    worksheet.write(0, col_num + 1, value, header_formater)
+                    worksheet.write(0, col_num, value, header_formater)
                 except Exception as e:
                     stderr.print(f"Error writing header at column {col_num + 1}: {e}")
 
@@ -124,10 +140,10 @@ def excel_formater(df, writer, sheet, out_file, have_index=True, have_header=Tru
             # Write the column headers with the defined format.
             for col_num in range(0, len(df.columns)):
                 for row_num in range(0, len(df)):
-                    if row_num < 3:
+                    if row_num < 4:
                         try:
                             worksheet.write(
-                                row_num + 1,
+                                row_num,
                                 col_num + 1,
                                 df.iloc[row_num, col_num],
                                 header_formater,
@@ -136,12 +152,24 @@ def excel_formater(df, writer, sheet, out_file, have_index=True, have_header=Tru
                             stderr.print(
                                 f"Error writing first column at row {row_num}: {e}"
                             )
-
+                        if row_num == 0 and col_num >= 1 and sheet == "METADATA_LAB":
+                            try:
+                                worksheet.write(
+                                    row_num,
+                                    col_num,
+                                    df.iloc[row_num, col_num],
+                                    red_header_formater,  # Apply the red-text header format
+                                )
+                            except Exception as e:
+                                stderr.print(
+                                    f"Error writing first row at column {col_num}: {e}"
+                                )
+            print(df.index)
             # Write the first column with the defined format.
             for index_num, index_val in enumerate(df.index):
                 try:
-                    worksheet.write(index_num + 1, 0, index_val, first_col_formater)
+                    worksheet.write(index_num, 0, index_val, first_col_formater)
                 except Exception as e:
                     stderr.print(f"Error writing first column at row {row_num}: {e}")
     except Exception as e:
-        stderr.print(f"Error in excel_formater: {e}")
+        stderr.print(f"Error in excel_formater: {e}")
\ No newline at end of file
diff --git a/relecov_tools/build_schema.py b/relecov_tools/build_schema.py
index d1f90922..e9ad7216 100644
--- a/relecov_tools/build_schema.py
+++ b/relecov_tools/build_schema.py
@@ -4,6 +4,7 @@
 import rich.console
 import pandas as pd
 import os
+import re
 import openpyxl
 import sys
 import json
@@ -14,6 +15,7 @@
 import relecov_tools.assets.schema_utils.jsonschema_draft
 import relecov_tools.assets.schema_utils.metadatalab_template
 from relecov_tools.config_json import ConfigJson
+from datetime import datetime
 from openpyxl.worksheet.datavalidation import DataValidation
 
 log = logging.getLogger(__name__)
@@ -33,6 +35,7 @@ def __init__(
         draft_version=None,
         show_diff=None,
         out_dir=None,
+        version=None,
     ):
         """
         Initialize the SchemaBuilder class. This class generates a JSON Schema file based on the provided draft version.
@@ -53,6 +56,9 @@ def __init__(
         else:
             self.output_folder = out_dir
 
+        # Get version option
+        self.version = version
+
         # Validate show diff option
         if not show_diff:
             self.show_diff = None
@@ -313,6 +319,7 @@ def build_new_schema(self, json_data, schema_draft):
                 "label_name": "label",
                 "fill_mode": "fill_mode",
                 "required (Y/N)": "required",
+                "submitting_lab_form": "header",
             }
             required_property_unique = []
 
@@ -424,10 +431,10 @@ def get_schema_diff(self, base_schema, new_schema):
 
         if not diff_lines:
             log.info(
-                "No differencess were found between already installed and new generated schema. Exiting. No changes made"
+                "No differences were found between already installed and new generated schema. Exiting. No changes made"
             )
             stderr.print(
-                "[yellow]No differencess were found between already installed and new generated schema. Exiting. No changes made"
+                "[yellow]No differences were found between already installed and new generated schema. Exiting. No changes made"
             )
             return None
         else:
@@ -441,11 +448,11 @@ def get_schema_diff(self, base_schema, new_schema):
 
     def print_save_schema_diff(self, diff_lines=None):
         # Set user's choices
-        choices = ["Print to sandard output (stdout)", "Save to file", "Both"]
+        choices = ["Print to standard output (stdout)", "Save to file", "Both"]
         diff_output_choice = relecov_tools.utils.prompt_selection(
             "How would you like to print the diff between schemes?:", choices
         )
-        if diff_output_choice in ["Print to sandard output (stdout)", "Both"]:
+        if diff_output_choice in ["Print to standard output (stdout)", "Both"]:
             for line in diff_lines:
                 print(line)
             return True
@@ -471,7 +478,10 @@ def save_new_schema(self, json_data):
             bool: True if the schema was successfully saved, False otherwise.
         """
         try:
-            path_to_save = self.output_folder + "/relecov_schema.json"
+            if not self.version:
+                raise ValueError("The next_version variable is not set.")
+
+            path_to_save = f"{self.output_folder}/relecov_schema_v{self.version}.json"
             with open(path_to_save, "w") as schema_file:
                 json.dump(json_data, schema_file, ensure_ascii=False, indent=4)
             log.info(f"New JSON schema saved to: {path_to_save}")
@@ -492,26 +502,90 @@ def save_new_schema(self, json_data):
     # FIXME: overview-tab - Still need to add the column that maps to tab metadatalab
     def create_metadatalab_excel(self, json_schema):
         """
-        Generate an Excel template file for Metadata LAB with three tabs: Overview, Metadata LAB, and Data Validation.
+        Generates an Excel template file for Metadata LAB with four sheets:
+        Overview, Metadata LAB, Data Validation, and Version History.
 
         Args:
-            json_schema (dict): The JSON Schema from which the Excel template is generated. It should include properties and required fields.
+            json_schema (dict): The JSON schema used to generate the template.
+                                It should include properties and required fields.
 
         Returns:
-            None: if any error occurs during the process.
+            None: If an error occurs during the process.
         """
         try:
-            # Set up metadatalab configuration
+            # Retrieve existing files in the output directory
+            output_files = os.listdir(self.output_folder)
+            notes_control_input = input(
+                "\033[93mEnter a note about changes made to the schema: \033[0m"
+            )
+            # Identify existing template files
+            template_files = [
+                f for f in output_files if f.startswith("Relecov_metadata_template")
+            ]
+            if template_files:
+                # Extract the latest version number from existing files
+                latest_file = max(
+                    template_files,
+                    key=lambda x: (
+                        re.search(r"v(\d+\.\d+\.\d+)", x).group(1)
+                        if re.search(r"v(\d+\.\d+\.\d+)", x)
+                        else "0"
+                    ),
+                )
+                match = re.search(r"v(\d+\.\d+\.\d+)", latest_file)
+                if match:
+                    # Load the latest template file and attempt to read version history
+                    out_file = os.path.join(self.output_folder, latest_file)
+                    version_history = pd.DataFrame(
+                        columns=["FILE_VERSION", "CODE", "NOTES CONTROL", "DATE"]
+                    )
+
+                    try:
+                        wb = openpyxl.load_workbook(out_file)
+                        if "VERSION" in wb.sheetnames:
+                            ws_version = wb["VERSION"]
+                            data = ws_version.values
+                            columns = next(data)
+                            version_history = pd.DataFrame(data, columns=columns)
+                    except Exception as e:
+                        log.warning(f"Error reading previous VERSION sheet: {e}")
+                    next_version = self.version
+                else:
+                    next_version = "1.0.0"
+                    out_file = os.path.join(
+                        self.output_folder,
+                        f"Relecov_metadata_template_v{next_version}.xlsx",
+                    )
+            else:
+                next_version = "1.0.0"
+                out_file = os.path.join(
+                    self.output_folder,
+                    f"Relecov_metadata_template_v{next_version}.xlsx",
+                )
+            # Store versioning information
+            version_info = {
+                "FILE_VERSION": f"Relecov_metadata_template_v{next_version}",
+                "CODE": next_version,
+                "NOTES CONTROL": notes_control_input,
+                "DATE": datetime.now().strftime("%Y-%m-%d"),
+            }
+            version_history = pd.concat(
+                [version_history, pd.DataFrame([version_info])], ignore_index=True
+            )
             out_file = os.path.join(
-                self.output_folder, "metadatalab_template" + ".xlsx"
+                self.output_folder, f"Relecov_metadata_template_v{next_version}.xlsx"
             )
+
+            # Define required metadata classifications
             required_classification = [
                 "Database Identifiers",
                 "Sample collection and processing",
                 "Host information",
                 "Sequencing",
-                "Pathogen Diagnostic testing",
+                "Pathogen diagnostic testing",
                 "Contributor Acknowledgement",
+                "Public databases",
+                "Bioinformatics and QC metrics fields",
             ]
             required_properties = json_schema.get("required")
             schema_properties = json_schema.get("properties")
@@ -524,6 +598,7 @@ def create_metadatalab_excel(self, json_schema):
                 df = relecov_tools.assets.schema_utils.metadatalab_template.schema_properties_to_df(
                     schema_properties_flatten
                 )
+                # Filter metadata fields based on required classifications
                 df = df[df["classification"].isin(required_classification)]
                 df["required"] = df["property_id"].apply(
                     lambda x: "Y" if x in required_properties else "N"
@@ -557,25 +632,34 @@ def create_metadatalab_excel(self, json_schema):
                 log.error(f"Error creating overview sheet: {e}")
                 stderr.print(f"Error creating overview sheet: {e}")
                 return None
-
-            # MetadataLab sheet
+            # Ensure 'header' column exists before filtering
+            if "header" in df.columns:
+                df["header"] = df["header"].astype(str).str.strip()
+                df_filtered = df[df["header"].str.upper() == "Y"]
+            else:
+                log.warning(
+                    "No se encontró la columna 'header', usando df sin filtrar."
+                )
+                df_filtered = df
+            # Create Metadata LAB sheet
             try:
-                metadatalab_header = ["EJEMPLOS", "DESCRIPCIÓN", "CAMPO"]
+                metadatalab_header = ["REQUERIDO", "EJEMPLOS", "DESCRIPCIÓN", "CAMPO"]
                 df_metadata = pd.DataFrame(
                     columns=[col_name for col_name in metadatalab_header]
                 )
-                df_metadata["EJEMPLOS"] = df["examples"].apply(
+                df_metadata["REQUERIDO"] = df_filtered["required"]
+                df_metadata["EJEMPLOS"] = df_filtered["examples"].apply(
                     lambda x: x[0] if isinstance(x, list) else x
                 )
-                df_metadata["DESCRIPCIÓN"] = df["description"]
-                df_metadata["CAMPO"] = df["label"]
+                df_metadata["DESCRIPCIÓN"] = df_filtered["description"]
+                df_metadata["CAMPO"] = df_filtered["label"]
                 df_metadata = df_metadata.transpose()
             except Exception as e:
                 log.error(f"Error creating MetadataLab sheet: {e}")
                 stderr.print(f"[red]Error creating MetadataLab sheet: {e}")
                 return None
 
-            # DataValidation sheet
+            # Create Data Validation sheet
             try:
                 datavalidation_header = ["EJEMPLOS", "DESCRIPCIÓN", "CAMPO"]
                 df_hasenum = df[(pd.notnull(df.enum))]
@@ -594,8 +678,7 @@ def create_metadatalab_excel(self, json_schema):
                 return None
 
             try:
-                # Since enums have different lengths we need further processing.
-                # Convert df into dict to perform data manipulation.
+                # Enums differ in length, so collect them in a dict for processing.
                 enum_dict = {property: [] for property in df_hasenum["property_id"]}
                 enum_maxitems = 0
                 # Populate the dictionary with flattened lists
@@ -634,6 +717,23 @@ def create_metadatalab_excel(self, json_schema):
                 stderr.print(f"[red]Error processing enums and combining data: {e}")
                 return None
 
+            # Replace NaN and +/-Inf values with empty strings
+            df_overview = (
+                df_overview.replace([float("inf"), float("-inf")], "")
+                .fillna("")
+                .infer_objects()
+            )
+            df_metadata = (
+                df_metadata.replace([float("inf"), float("-inf")], "")
+                .fillna("")
+                .infer_objects()
+            )
+            df_validation = (
+                df_validation.replace([float("inf"), float("-inf")], "")
+                .fillna("")
+                .infer_objects()
+            )
+
             # WRITE EXCEL
             try:
                 writer = pd.ExcelWriter(out_file, engine="xlsxwriter")
@@ -661,6 +761,7 @@ def create_metadatalab_excel(self, json_schema):
                     have_index=True,
                     have_header=False,
                 )
+                version_history.to_excel(writer, sheet_name="VERSION", index=False)
                 writer.close()
                 log.info(f"Metadata lab template successfuly created in: {out_file}")
                 stderr.print(
@@ -674,7 +775,8 @@ def create_metadatalab_excel(self, json_schema):
             try:
                 wb = openpyxl.load_workbook(out_file)
                 ws_metadata = wb["METADATA_LAB"]
-
+                ws_metadata.freeze_panes = "D1"
+                ws_metadata.delete_rows(5)
                 ws_dropdowns = (
                     wb.create_sheet("DROPDOWNS")
                     if "DROPDOWNS" not in wb.sheetnames
@@ -716,7 +818,47 @@ def create_metadatalab_excel(self, json_schema):
 
                         ws_metadata.add_data_validation(dropdown)
                         dropdown.add(dropdown_range_metadata)
+
+                if "OVERVIEW" in wb.sheetnames:
+                    ws_overview = wb["OVERVIEW"]
+                    ws_overview.protection.sheet = True
+                    ws_overview.protection.password = "password123"
+
+                if "DATA_VALIDATION" in wb.sheetnames:
+                    ws_data_validation = wb["DATA_VALIDATION"]
+                    ws_data_validation.protection.sheet = True
+                    ws_data_validation.protection.password = "password123"
+
+                if "VERSION" in wb.sheetnames:
+                    ws_data_validation = wb["VERSION"]
+                    ws_data_validation.protection.sheet = True
+                    ws_data_validation.protection.password = "password123"
+
+                    ws_version = wb["VERSION"]
+                    column_widths = []
+
+                    for col in ws_version.columns:
+                        max_length = 0
+                        column = col[0].column_letter
+                        for cell in col:
+                            try:
+                                if len(str(cell.value)) > max_length:
+                                    max_length = len(cell.value)
+                            except:
+                                pass
+                        adjusted_width = max_length + 2
+                        column_widths.append(adjusted_width)
+
+                    # Apply the calculated column width
+                    for i, width in enumerate(column_widths):
+                        ws_version.column_dimensions[
+                            openpyxl.utils.get_column_letter(i + 1)
+                        ].width = width
+
                 ws_dropdowns.sheet_state = "hidden"
+                ws_dropdowns.protection.sheet = True
+                ws_dropdowns.protection.password = "password123"
+
                 wb.save(out_file)
             except Exception as e:
                 log.error(f"Error adding dropdowns: {e}")