Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve "options" interpretation in build-schema and update read-lab-metadata field type #388

Merged
Merged 8 commits into the base branch on Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Code contributions to the release:

- Added a more robust datatype handling in utils.py read_csv_file_return_dict() method [#379](https://github.com/BU-ISCIII/relecov-tools/pull/379)
- Improved relecov template generator and version control [#382](https://github.com/BU-ISCIII/relecov-tools/pull/382)
- Improve "options" interpretation in build-schema and update read-lab-metadata field type [#388](https://github.com/BU-ISCIII/relecov-tools/pull/388)

#### Fixes

Expand Down
24 changes: 23 additions & 1 deletion relecov_tools/build_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def build_new_schema(self, json_data, schema_draft):
"examples": "examples",
"ontology_id": "ontology",
"type": "type",
"format": "format",
"options": "options",
"description": "description",
"classification": "classification",
"label_name": "label",
Expand Down Expand Up @@ -390,6 +390,28 @@ def build_new_schema(self, json_data, schema_draft):
is_required = str(db_features_dic[db_feature_key])
if is_required != "nan":
required_property[property_id] = is_required
elif db_feature_key == "options":
options_value = str(
db_features_dic.get("options", "")
).strip()
if options_value:
options_dict = {}
options_list = options_value.split(",")

for option in options_list:
key_value = option.split(":")
if len(key_value) == 2:
key = key_value[0].strip()
value = key_value[1].strip()
try:
if "." in value:
value = float(value)
else:
value = int(value)
except ValueError:
pass
options_dict[key] = value
schema_property.update(options_dict)
else:
std_json_feature = self.standard_jsonschema_object(
db_features_dic, db_feature_key
Expand Down
6 changes: 4 additions & 2 deletions relecov_tools/json_validation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
import logging
import rich.console
from jsonschema import Draft202012Validator
from jsonschema import Draft202012Validator, FormatChecker
import sys
import os
import openpyxl
Expand Down Expand Up @@ -110,7 +110,9 @@ def validate_instances(self):
"""Validate data instances against a validated json schema"""

# create validator
validator = Draft202012Validator(self.json_schema)
validator = Draft202012Validator(
self.json_schema, format_checker=FormatChecker()
)
schema_props = self.json_schema["properties"]

validated_json_data = []
Expand Down
58 changes: 34 additions & 24 deletions relecov_tools/read_lab_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,9 +394,6 @@ def read_metadata_file(self):
ws_metadata_lab, heading_row_number = relecov_tools.utils.read_excel_file(
self.metadata_file, alt_sheet, header_flag, leave_empty=False
)
alt_header_dict = self.configuration.get_topic_data(
"lab_metadata", "alt_heading_equivalences"
)
valid_metadata_rows = []
included_sample_ids = []
row_number = heading_row_number
Expand All @@ -408,6 +405,7 @@ def read_metadata_file(self):
except KeyError:
self.logsum.add_error(entry=f"No {sample_id_col} found in excel file")
continue
# Validations on the sample_id
if sample_id in included_sample_ids:
log_text = f"Skipped duplicated sample {sample_id} in row {row_number}. Sequencing sample id must be unique"
self.logsum.add_warning(entry=log_text)
Expand All @@ -419,13 +417,42 @@ def read_metadata_file(self):
continue
included_sample_ids.append(sample_id)
for key in row.keys():
# skip the first column of the Metadata lab file
if header_flag in key:
continue
if row[key] is None or "not provided" in str(row[key]).lower():
value = row[key]
# Omitting empty or not provided values
if value is None or "not provided" in str(value).lower():
log_text = f"{key} not provided for sample {sample_id}"
self.logsum.add_warning(sample=sample_id, entry=log_text)
continue
# Get JSON schema type
schema_key = self.label_prop_dict.get(key, key)
schema_type = (
self.relecov_sch_json["properties"]
.get(schema_key, {})
.get("type", "string")
)
# Conversion of values according to expected type
try:
if schema_type == "integer":
try:
value = int(float(value))
except (ValueError, TypeError):
value = str(value).strip()
                        elif schema_type == "number":
                            try:
                                value = float(value)
                            except (ValueError, TypeError):
                                value = str(value).strip()
elif schema_type == "boolean":
value = str(value).strip().lower() in ["true", "yes", "1"]
elif schema_type == "string":
value = str(value).strip()
except (ValueError, TypeError) as e:
log_text = f"Type conversion error for {key} (expected {schema_type}): {value}. {str(e)}"
self.logsum.add_error(sample=sample_id, entry=log_text)
stderr.print(f"[red]{log_text}")
continue
if "date" in key.lower():
# Check if date is a string. Format YYYY/MM/DD to YYYY-MM-DD
pattern = r"^\d{4}[-/.]\d{2}[-/.]\d{2}"
Expand Down Expand Up @@ -453,31 +480,14 @@ def read_metadata_file(self):
logtxt = f"Non-date field {key} provided as date. Parsed as int"
self.logsum.add_warning(sample=sample_id, entry=logtxt)
row[key] = str(relecov_tools.utils.excel_date_to_num(row[key]))
if self.alternative_heading:
alt_key = alt_header_dict.get(key)
if row[key] is not None or "not provided" not in str(row[key]).lower():
try:
property_row[self.label_prop_dict[key]] = str(row[key]).strip()
except KeyError as e:
if self.alternative_heading:
try:
property_row[self.label_prop_dict[alt_key]] = str(
row[key]
).strip()
continue
except KeyError:
pass
log_text = f"Error when mapping the label {str(e)}"
self.logsum.add_error(sample=sample_id, entry=log_text)
stderr.print(f"[red]{log_text}")
continue
property_row[schema_key] = value
valid_metadata_rows.append(property_row)

return valid_metadata_rows

def create_metadata_json(self):
stderr.print("[blue]Reading Lab Metadata Excel File")
valid_metadata_rows = self.read_metadata_file()
stderr.print(f"[green]Processed {len(valid_metadata_rows)} valid metadata rows")
clean_metadata_rows, missing_samples = self.match_to_json(valid_metadata_rows)
if missing_samples:
num_miss = len(missing_samples)
Expand Down
Loading