From abd6c97c6284c8d235683492a7b8dc4343286364 Mon Sep 17 00:00:00 2001 From: Alejandro Date: Fri, 7 Feb 2025 11:10:49 +0100 Subject: [PATCH] Improved build-schema module --- .../schema_utils/metadatalab_template.py | 48 ++++- relecov_tools/build_schema.py | 184 ++++++++++++++++-- 2 files changed, 201 insertions(+), 31 deletions(-) diff --git a/relecov_tools/assets/schema_utils/metadatalab_template.py b/relecov_tools/assets/schema_utils/metadatalab_template.py index db4d4a1d..4f15cd26 100644 --- a/relecov_tools/assets/schema_utils/metadatalab_template.py +++ b/relecov_tools/assets/schema_utils/metadatalab_template.py @@ -67,7 +67,6 @@ def schema_properties_to_df(json_data): stderr.print(f"[red]Error in schema_properties_to_df: {e}") return None - def excel_formater(df, writer, sheet, out_file, have_index=True, have_header=True): try: @@ -80,24 +79,41 @@ def excel_formater(df, writer, sheet, out_file, have_index=True, have_header=Tru workbook = writer.book worksheet = writer.sheets[sheet] - # setup excel format + # Set up general column width worksheet.set_column(0, len(df.columns), 30) + + # General header format header_formater = workbook.add_format( { "bold": True, "text_wrap": False, "valign": "top", - "fg_color": "#ADD8E6", + "fg_color": "#B9DADE", # Light blue "border": 1, "locked": True, } ) + + # Custom header format for METADATA_LAB (red text starting from column 2) + red_header_formater = workbook.add_format( + { + "bold": True, + "text_wrap": False, + "valign": "top", + "fg_color": "#B9DADE", # Light blue background + "color": "#E05959", # Red text color + "border": 1, + "locked": True, + } + ) + + # First column format first_col_formater = workbook.add_format( { "bold": True, "text_wrap": False, "valign": "center", - "fg_color": "#ADD8E6", + "fg_color": "#B9DADE", # Light blue "border": 1, "locked": True, } @@ -107,7 +123,7 @@ def excel_formater(df, writer, sheet, out_file, have_index=True, have_header=Tru # Write the column headers with the defined format. for col_num, value in enumerate(df.columns.values): try: - worksheet.write(0, col_num + 1, value, header_formater) + worksheet.write(0, col_num, value, header_formater) except Exception as e: stderr.print(f"Error writing header at column {col_num + 1}: {e}") @@ -124,10 +140,10 @@ def excel_formater(df, writer, sheet, out_file, have_index=True, have_header=Tru # Write the column headers with the defined format. for col_num in range(0, len(df.columns)): for row_num in range(0, len(df)): - if row_num < 3: + if row_num < 4: try: worksheet.write( - row_num + 1, + row_num, col_num + 1, df.iloc[row_num, col_num], header_formater, @@ -136,12 +152,24 @@ def excel_formater(df, writer, sheet, out_file, have_index=True, have_header=Tru stderr.print( f"Error writing first column at row {row_num}: {e}" ) - + if row_num == 0 and col_num >= 1 and sheet == "METADATA_LAB": + try: + worksheet.write( + row_num, + col_num, + df.iloc[row_num, col_num], + red_header_formater, # Aplicar formato + ) + except Exception as e: + stderr.print( + f"Error writing first row at column {col_num}: {e}" + ) + print(df.index) # Write the first column with the defined format. for index_num, index_val in enumerate(df.index): try: - worksheet.write(index_num + 1, 0, index_val, first_col_formater) + worksheet.write(index_num, 0, index_val, first_col_formater) except Exception as e: stderr.print(f"Error writing first column at row {row_num}: {e}") except Exception as e: - stderr.print(f"Error in excel_formater: {e}") + stderr.print(f"Error in excel_formater: {e}") \ No newline at end of file diff --git a/relecov_tools/build_schema.py b/relecov_tools/build_schema.py index d1f90922..e9ad7216 100644 --- a/relecov_tools/build_schema.py +++ b/relecov_tools/build_schema.py @@ -4,6 +4,7 @@ import rich.console import pandas as pd import os +import re import openpyxl import sys import json @@ -14,6 +15,7 @@ import relecov_tools.assets.schema_utils.jsonschema_draft import relecov_tools.assets.schema_utils.metadatalab_template from relecov_tools.config_json import ConfigJson +from datetime import datetime from openpyxl.worksheet.datavalidation import DataValidation log = logging.getLogger(__name__) @@ -33,6 +35,7 @@ def __init__( draft_version=None, show_diff=None, out_dir=None, + version=None, ): """ Initialize the SchemaBuilder class. This class generates a JSON Schema file based on the provided draft version. @@ -53,6 +56,9 @@ def __init__( else: self.output_folder = out_dir + # Get version option + self.version = version + # Validate show diff option if not show_diff: self.show_diff = None @@ -313,6 +319,7 @@ def build_new_schema(self, json_data, schema_draft): "label_name": "label", "fill_mode": "fill_mode", "required (Y/N)": "required", + "submitting_lab_form": "header", } required_property_unique = [] @@ -424,10 +431,10 @@ def get_schema_diff(self, base_schema, new_schema): if not diff_lines: log.info( - "No differencess were found between already installed and new generated schema. Exiting. No changes made" + "No differences were found between already installed and new generated schema. Exiting. No changes made" ) stderr.print( - "[yellow]No differencess were found between already installed and new generated schema. Exiting. No changes made" + "[yellow]No differences were found between already installed and new generated schema. Exiting. No changes made" ) return None else: @@ -441,11 +448,11 @@ def get_schema_diff(self, base_schema, new_schema): def print_save_schema_diff(self, diff_lines=None): # Set user's choices - choices = ["Print to sandard output (stdout)", "Save to file", "Both"] + choices = ["Print to standard output (stdout)", "Save to file", "Both"] diff_output_choice = relecov_tools.utils.prompt_selection( "How would you like to print the diff between schemes?:", choices ) - if diff_output_choice in ["Print to sandard output (stdout)", "Both"]: + if diff_output_choice in ["Print to standard output (stdout)", "Both"]: for line in diff_lines: print(line) return True @@ -471,7 +478,10 @@ def save_new_schema(self, json_data): bool: True if the schema was successfully saved, False otherwise. """ try: - path_to_save = self.output_folder + "/relecov_schema.json" + if not self.version: + raise ValueError("The next_version variable is not set.") + + path_to_save = f"{self.output_folder}/relecov_schema_v{self.version}.json" with open(path_to_save, "w") as schema_file: json.dump(json_data, schema_file, ensure_ascii=False, indent=4) log.info(f"New JSON schema saved to: {path_to_save}") @@ -492,26 +502,90 @@ def save_new_schema(self, json_data): # FIXME: overview-tab - Still need to add the column that maps to tab metadatalab def create_metadatalab_excel(self, json_schema): """ - Generate an Excel template file for Metadata LAB with three tabs: Overview, Metadata LAB, and Data Validation. + Generates an Excel template file for Metadata LAB with four sheets: + Overview, Metadata LAB, Data Validation, and Version History. Args: - json_schema (dict): The JSON Schema from which the Excel template is generated. It should include properties and required fields. + json_schema (dict): The JSON schema used to generate the template. + It should include properties and required fields. Returns: - None: if any error occurs during the process. + None: If an error occurs during the process. """ try: - # Set up metadatalab configuration + # Retrieve existing files in the output directory + output_files = os.listdir(self.output_folder) + notes_control_input = input( + "\033[93mEnter a note about changes made to the schema: \033[0m" + ) + # Identify existing template files + template_files = [ + f for f in output_files if f.startswith("Relecov_metadata_template") + ] + if template_files: + # Extract the latest version number from existing files + latest_file = max( + template_files, + key=lambda x: ( + re.search(r"v(\d+\.\d+\.\d+)", x).group(1) + if re.search(r"v(\d+\.\d+\.\d+)", x) + else "0" + ), + ) + match = re.search(r"v(\d+\.\d+\.\d+)", latest_file) + if match: + # Load the latest template file and attempt to read version history + out_file = os.path.join(self.output_folder, latest_file) + version_history = pd.DataFrame( + columns=["FILE_VERSION", "CODE", "NOTES CONTROL", "DATE"] + ) + + try: + wb = openpyxl.load_workbook(out_file) + if "VERSION" in wb.sheetnames: + ws_version = wb["VERSION"] + data = ws_version.values + columns = next(data) + version_history = pd.DataFrame(data, columns=columns) + except Exception as e: + log.warning(f"Error reading previous VERSION sheet: {e}") + next_version = self.version + else: + next_version = "1.0.0" + out_file = os.path.join( + self.output_folder, + f"Relecov_metadata_template_v{next_version}.xlsx", + ) + else: + next_version = "1.0.0" + out_file = os.path.join( + self.output_folder, + f"Relecov_metadata_template_v{next_version}.xlsx", + ) + # Store versioning information + version_info = { + "FILE_VERSION": f"Relecov_metadata_template_v{next_version}", + "CODE": next_version, + "NOTES CONTROL": notes_control_input, + "DATE": datetime.now().strftime("%Y-%m-%d"), + } + version_history = pd.concat( + [version_history, pd.DataFrame([version_info])], ignore_index=True + ) out_file = os.path.join( - self.output_folder, "metadatalab_template" + ".xlsx" + self.output_folder, f"Relecov_metadata_template_v{next_version}.xlsx" ) + + # Define required metadata classifications required_classification = [ "Database Identifiers", "Sample collection and processing", "Host information", "Sequencing", - "Pathogen Diagnostic testing", + "Pathogen diagnostic testing", "Contributor Acknowledgement", + "Public databases", + "Bioinformatics and QC metrics fields", ] required_properties = json_schema.get("required") schema_properties = json_schema.get("properties") @@ -524,6 +598,7 @@ def create_metadatalab_excel(self, json_schema): df = relecov_tools.assets.schema_utils.metadatalab_template.schema_properties_to_df( schema_properties_flatten ) + # Filter metadata fields based on required classifications df = df[df["classification"].isin(required_classification)] df["required"] = df["property_id"].apply( lambda x: "Y" if x in required_properties else "N" @@ -557,25 +632,34 @@ def create_metadatalab_excel(self, json_schema): log.error(f"Error creating overview sheet: {e}") stderr.print(f"Error creating overview sheet: {e}") return None - - # MetadataLab sheet + # Ensure 'header' column exists before filtering + if "header" in df.columns: + df["header"] = df["header"].astype(str).str.strip() + df_filtered = df[df["header"].str.upper() == "Y"] + else: + log.warning( + "No se encontró la columna 'header', usando df sin filtrar." + ) + df_filtered = df + # Create Metadata LAB sheet try: - metadatalab_header = ["EJEMPLOS", "DESCRIPCIÓN", "CAMPO"] + metadatalab_header = ["REQUERIDO", "EJEMPLOS", "DESCRIPCIÓN", "CAMPO"] df_metadata = pd.DataFrame( columns=[col_name for col_name in metadatalab_header] ) - df_metadata["EJEMPLOS"] = df["examples"].apply( + df_metadata["REQUERIDO"] = df_filtered["required"] + df_metadata["EJEMPLOS"] = df_filtered["examples"].apply( lambda x: x[0] if isinstance(x, list) else x ) - df_metadata["DESCRIPCIÓN"] = df["description"] - df_metadata["CAMPO"] = df["label"] + df_metadata["DESCRIPCIÓN"] = df_filtered["description"] + df_metadata["CAMPO"] = df_filtered["label"] df_metadata = df_metadata.transpose() except Exception as e: log.error(f"Error creating MetadataLab sheet: {e}") stderr.print(f"[red]Error creating MetadataLab sheet: {e}") return None - # DataValidation sheet + # Create Data Validation sheet try: datavalidation_header = ["EJEMPLOS", "DESCRIPCIÓN", "CAMPO"] df_hasenum = df[(pd.notnull(df.enum))] @@ -594,8 +678,7 @@ def create_metadatalab_excel(self, json_schema): return None try: - # Since enums have different lengths we need further processing. - # Convert df into dict to perform data manipulation. + enum_dict = {property: [] for property in df_hasenum["property_id"]} enum_maxitems = 0 # Populate the dictionary with flattened lists @@ -634,6 +717,23 @@ def create_metadatalab_excel(self, json_schema): stderr.print(f"[red]Error processing enums and combining data: {e}") return None + # Replace NaN, Inf values with empty strings + df_overview = ( + df_overview.replace([float("inf"), float("-inf")], "") + .fillna("") + .infer_objects() + ) + df_metadata = ( + df_metadata.replace([float("inf"), float("-inf")], "") + .fillna("") + .infer_objects() + ) + df_validation = ( + df_validation.replace([float("inf"), float("-inf")], "") + .fillna("") + .infer_objects() + ) + # WRITE EXCEL try: writer = pd.ExcelWriter(out_file, engine="xlsxwriter") @@ -661,6 +761,7 @@ def create_metadatalab_excel(self, json_schema): have_index=True, have_header=False, ) + version_history.to_excel(writer, sheet_name="VERSION", index=False) writer.close() log.info(f"Metadata lab template successfuly created in: {out_file}") stderr.print( @@ -674,7 +775,8 @@ def create_metadatalab_excel(self, json_schema): try: wb = openpyxl.load_workbook(out_file) ws_metadata = wb["METADATA_LAB"] - + ws_metadata.freeze_panes = "D1" + ws_metadata.delete_rows(5) ws_dropdowns = ( wb.create_sheet("DROPDOWNS") if "DROPDOWNS" not in wb.sheetnames @@ -716,7 +818,47 @@ def create_metadatalab_excel(self, json_schema): ws_metadata.add_data_validation(dropdown) dropdown.add(dropdown_range_metadata) + + if "OVERVIEW" in wb.sheetnames: + ws_overview = wb["OVERVIEW"] + ws_overview.protection.sheet = True + ws_overview.protection.password = "password123" + + if "DATA_VALIDATION" in wb.sheetnames: + ws_data_validation = wb["DATA_VALIDATION"] + ws_data_validation.protection.sheet = True + ws_data_validation.protection.password = "password123" + + if "VERSION" in wb.sheetnames: + ws_data_validation = wb["VERSION"] + ws_data_validation.protection.sheet = True + ws_data_validation.protection.password = "password123" + + ws_version = wb["VERSION"] + column_widths = [] + + for col in ws_version.columns: + max_length = 0 + column = col[0].column_letter + for cell in col: + try: + if len(str(cell.value)) > max_length: + max_length = len(cell.value) + except: + pass + adjusted_width = max_length + 2 + column_widths.append(adjusted_width) + + # Apply the calculated column width + for i, width in enumerate(column_widths): + ws_version.column_dimensions[ + openpyxl.utils.get_column_letter(i + 1) + ].width = width + ws_dropdowns.sheet_state = "hidden" + ws_dropdowns.protection.sheet = True + ws_dropdowns.protection.password = "password123" + wb.save(out_file) except Exception as e: log.error(f"Error adding dropdowns: {e}")