From 0974b39b261f96a8606bcc2c9074ae7c0d08255e Mon Sep 17 00:00:00 2001 From: Alejandro Date: Tue, 11 Feb 2025 13:16:53 +0100 Subject: [PATCH 1/8] Update build-schema "options" column interpretation --- relecov_tools/build_schema.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/relecov_tools/build_schema.py b/relecov_tools/build_schema.py index 5e8623c8..953caf35 100644 --- a/relecov_tools/build_schema.py +++ b/relecov_tools/build_schema.py @@ -342,6 +342,7 @@ def build_new_schema(self, json_data, schema_draft): "ontology_id": "ontology", "type": "type", "format": "format", + "options": "options", "description": "description", "classification": "classification", "label_name": "label", @@ -390,6 +391,26 @@ def build_new_schema(self, json_data, schema_draft): is_required = str(db_features_dic[db_feature_key]) if is_required != "nan": required_property[property_id] = is_required + elif db_feature_key == "options": + options_value = str(db_features_dic.get("options", "")).strip() + if options_value: + options_dict = {} + options_list = options_value.split(",") + + for option in options_list: + key_value = option.split(":") + if len(key_value) == 2: + key = key_value[0].strip() + value = key_value[1].strip() + try: + if "." 
in value: + value = float(value) + else: + value = int(value) + except ValueError: + pass + options_dict[key] = value + schema_property.update(options_dict) else: std_json_feature = self.standard_jsonschema_object( db_features_dic, db_feature_key From 86b612280df771b44925278c102490303d554ff4 Mon Sep 17 00:00:00 2001 From: Alejandro Date: Tue, 11 Feb 2025 13:17:47 +0100 Subject: [PATCH 2/8] Update read_lab_metadata field formatting --- relecov_tools/read_lab_metadata.py | 58 +++++++++++++++++------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/relecov_tools/read_lab_metadata.py b/relecov_tools/read_lab_metadata.py index 792e76b8..ebc4245a 100755 --- a/relecov_tools/read_lab_metadata.py +++ b/relecov_tools/read_lab_metadata.py @@ -394,9 +394,6 @@ def read_metadata_file(self): ws_metadata_lab, heading_row_number = relecov_tools.utils.read_excel_file( self.metadata_file, alt_sheet, header_flag, leave_empty=False ) - alt_header_dict = self.configuration.get_topic_data( - "lab_metadata", "alt_heading_equivalences" - ) valid_metadata_rows = [] included_sample_ids = [] row_number = heading_row_number @@ -408,6 +405,7 @@ def read_metadata_file(self): except KeyError: self.logsum.add_error(entry=f"No {sample_id_col} found in excel file") continue + # Validations on the sample_id if sample_id in included_sample_ids: log_text = f"Skipped duplicated sample {sample_id} in row {row_number}.
Sequencing sample id must be unique" self.logsum.add_warning(entry=log_text) @@ -419,13 +417,42 @@ def read_metadata_file(self): continue included_sample_ids.append(sample_id) for key in row.keys(): - # skip the first column of the Metadata lab file if header_flag in key: continue - if row[key] is None or "not provided" in str(row[key]).lower(): + value = row[key] + # Omitting empty or not provided values + if value is None or "not provided" in str(value).lower(): log_text = f"{key} not provided for sample {sample_id}" self.logsum.add_warning(sample=sample_id, entry=log_text) continue + # Get JSON schema type + schema_key = self.label_prop_dict.get(key, key) + schema_type = ( + self.relecov_sch_json["properties"] + .get(schema_key, {}) + .get("type", "string") + ) + # Conversion of values according to expected type + try: + if schema_type == "integer": + try: + value = int(float(value)) + except (ValueError, TypeError): + value = str(value).strip() + elif schema_type == "number": + try: + value = int(float(value)) + except (ValueError, TypeError): + value = float(value).strip() + elif schema_type == "boolean": + value = str(value).strip().lower() in ["true", "yes", "1"] + elif schema_type == "string": + value = str(value).strip() + except (ValueError, TypeError) as e: + log_text = f"Type conversion error for {key} (expected {schema_type}): {value}. {str(e)}" + self.logsum.add_error(sample=sample_id, entry=log_text) + stderr.print(f"[red]{log_text}") + continue if "date" in key.lower(): # Check if date is a string. Format YYYY/MM/DD to YYYY-MM-DD pattern = r"^\d{4}[-/.]\d{2}[-/.]\d{2}" @@ -453,31 +480,14 @@ def read_metadata_file(self): logtxt = f"Non-date field {key} provided as date. 
Parsed as int" self.logsum.add_warning(sample=sample_id, entry=logtxt) row[key] = str(relecov_tools.utils.excel_date_to_num(row[key])) - if self.alternative_heading: - alt_key = alt_header_dict.get(key) - if row[key] is not None or "not provided" not in str(row[key]).lower(): - try: - property_row[self.label_prop_dict[key]] = str(row[key]).strip() - except KeyError as e: - if self.alternative_heading: - try: - property_row[self.label_prop_dict[alt_key]] = str( - row[key] - ).strip() - continue - except KeyError: - pass - log_text = f"Error when mapping the label {str(e)}" - self.logsum.add_error(sample=sample_id, entry=log_text) - stderr.print(f"[red]{log_text}") - continue + property_row[schema_key] = value valid_metadata_rows.append(property_row) - return valid_metadata_rows def create_metadata_json(self): stderr.print("[blue]Reading Lab Metadata Excel File") valid_metadata_rows = self.read_metadata_file() + stderr.print(f"[green]Processed {len(valid_metadata_rows)} valid metadata rows") clean_metadata_rows, missing_samples = self.match_to_json(valid_metadata_rows) if missing_samples: num_miss = len(missing_samples) From 3bc861bd7083b24b4fd5a27296dca490a46a0615 Mon Sep 17 00:00:00 2001 From: Alejandro Date: Tue, 11 Feb 2025 13:43:21 +0100 Subject: [PATCH 3/8] update linting --- relecov_tools/build_schema.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/relecov_tools/build_schema.py b/relecov_tools/build_schema.py index 953caf35..beafceed 100644 --- a/relecov_tools/build_schema.py +++ b/relecov_tools/build_schema.py @@ -392,7 +392,9 @@ def build_new_schema(self, json_data, schema_draft): if is_required != "nan": required_property[property_id] = is_required elif db_feature_key == "options": - options_value = str(db_features_dic.get("options", "")).strip() + options_value = str( + db_features_dic.get("options", "") + ).strip() if options_value: options_dict = {} options_list = options_value.split(",") From 
49c968f123e6a4342aa0712ac184ac4f4567bf05 Mon Sep 17 00:00:00 2001 From: Alejandro Date: Tue, 11 Feb 2025 13:45:12 +0100 Subject: [PATCH 4/8] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 65eb6acb..b1f3ab5c 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Code contributions to the release: - Added a more robust datatype handling in utils.py read_csv_file_return_dict() method [#379](https://github.com/BU-ISCIII/relecov-tools/pull/379) - Improved relecov template generator and version control [#382](https://github.com/BU-ISCIII/relecov-tools/pull/382) +- Improve "options" interpretation in build-schema and update read-lab-metadata field type [#388](https://github.com/BU-ISCIII/relecov-tools/pull/388) #### Fixes From fcea31c70f9ec89b81996d995b61d4211f38fabd Mon Sep 17 00:00:00 2001 From: Alejandro Date: Wed, 12 Feb 2025 08:36:01 +0100 Subject: [PATCH 5/8] Update column "options" --- relecov_tools/build_schema.py | 1 - 1 file changed, 1 deletion(-) diff --git a/relecov_tools/build_schema.py b/relecov_tools/build_schema.py index beafceed..45d5b72e 100644 --- a/relecov_tools/build_schema.py +++ b/relecov_tools/build_schema.py @@ -341,7 +341,6 @@ def build_new_schema(self, json_data, schema_draft): "examples": "examples", "ontology_id": "ontology", "type": "type", - "format": "format", "options": "options", "description": "description", "classification": "classification", From e1e7d23dbfe092622e1fc3077bf9c8b052d00e36 Mon Sep 17 00:00:00 2001 From: Alejandro Date: Wed, 12 Feb 2025 08:36:28 +0100 Subject: [PATCH 6/8] Apply verification to "format:date" --- relecov_tools/json_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/relecov_tools/json_validation.py b/relecov_tools/json_validation.py index 516377bb..2a7c0a27 100755 --- a/relecov_tools/json_validation.py +++ b/relecov_tools/json_validation.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import logging import 
rich.console -from jsonschema import Draft202012Validator +from jsonschema import Draft202012Validator, FormatChecker import sys import os import openpyxl @@ -110,7 +110,7 @@ def validate_instances(self): """Validate data instances against a validated json schema""" # create validator - validator = Draft202012Validator(self.json_schema) + validator = Draft202012Validator(self.json_schema, format_checker=FormatChecker()) schema_props = self.json_schema["properties"] validated_json_data = [] From b7e97772b073a80020ff7276c27b515e6a897998 Mon Sep 17 00:00:00 2001 From: Alejandro Date: Wed, 12 Feb 2025 08:39:42 +0100 Subject: [PATCH 7/8] Fix black lint --- relecov_tools/json_validation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/relecov_tools/json_validation.py b/relecov_tools/json_validation.py index 2a7c0a27..f85166b8 100755 --- a/relecov_tools/json_validation.py +++ b/relecov_tools/json_validation.py @@ -110,7 +110,9 @@ def validate_instances(self): """Validate data instances against a validated json schema""" # create validator - validator = Draft202012Validator(self.json_schema, format_checker=FormatChecker()) + validator = Draft202012Validator( + self.json_schema, format_checker=FormatChecker() + ) schema_props = self.json_schema["properties"] validated_json_data = [] From 024f73e43175ea67325e78ba6d76cb4474562371 Mon Sep 17 00:00:00 2001 From: Alejandro Date: Wed, 12 Feb 2025 09:44:58 +0100 Subject: [PATCH 8/8] Fix float to int read-lab-metadata --- relecov_tools/read_lab_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relecov_tools/read_lab_metadata.py b/relecov_tools/read_lab_metadata.py index ebc4245a..aa01d769 100755 --- a/relecov_tools/read_lab_metadata.py +++ b/relecov_tools/read_lab_metadata.py @@ -441,7 +441,7 @@ def read_metadata_file(self): value = str(value).strip() elif schema_type == "number": try: - value = int(float(value)) + value = float(value) except (ValueError, TypeError): value = 
float(value).strip() elif schema_type == "boolean":