From d0091285c8f58df99b70ead209df4cedd5ede004 Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Wed, 31 Jul 2024 13:42:02 -0700 Subject: [PATCH 01/24] Added --rename-headers functionality --- pds4indextools/pds4_create_xml_index.py | 48 ++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index c34edd1..260632e 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -123,6 +123,12 @@ def correct_duplicates(label_results): if number.isdigit(): cropped = tag.replace('_'+number, '') if cropped in element_names: + if str(cropped+'_'+number+'<1>') in key: + key_new = key.replace((cropped+'_'+str((int(number)+1))+'<1>'), + cropped+'<1>') + else: + key_new = key.replace((cropped+'_'+str((int(number)+1))), + cropped+'<1>') key_new = key.replace(('_' + number + '<1>'), '<1>') parent = key_new.split('/')[-2].split('<')[0] key_new = key_new.replace(parent+'<1>', parent+'<'+str(int(number)+1)+'>') @@ -406,7 +412,7 @@ def process_headers(label_results, key, root, namespaces, prefixes): label_results[key_new] = label_results.pop(key) -def renumber_xpaths(xpaths): +def renumber_xpaths(xpaths, args): """ Renumber a list of XPaths to be sequential at each level. @@ -458,6 +464,8 @@ def renumber_xpaths(xpaths): Parameters: xpaths (list): The list of XPaths or XPath fragments. + args (argparse.Namespace): Arguments parsed from command line using argparse. + Returns: dict: A dictionary containing a mapping from the original XPaths to the @@ -511,7 +519,10 @@ def split_xpath_prefix_and_num(s): # increasing starting at 1. We also add a special entry for the empty # suffix when there is no number. 
unique_nums = sorted({x.num for x in prefix_group_list if x.num is not None}) - renumber_map = {x: f'<{i+1}>' for i, x in enumerate(unique_nums)} + if args.dont_number_unique_tags and len(unique_nums) == 1: + renumber_map = {x: '' for x in unique_nums} + else: + renumber_map = {x: f'<{i+1}>' for i, x in enumerate(unique_nums)} renumber_map[None] = '' # We further group these by unique parent (including the number) @@ -527,7 +538,7 @@ def split_xpath_prefix_and_num(s): # down. children = [x for x in parent_group_list if x.child is not None] if children: - child_map = renumber_xpaths([x.child for x in children]) + child_map = renumber_xpaths([x.child for x in children], args) xpath_map.update( { f'{x.parent}/{x.child}': ( @@ -569,6 +580,20 @@ def split_into_elements(xpath): return elements +def replace_columns(filepath, df): + # Create an empty dictionary to store column mappings + column_mappings = {} + + # Read the file and populate the dictionary + with open(filepath, 'r') as file: + for line in file: + old_name, new_name = line.strip().split(', ') + column_mappings[old_name] = new_name + + # Step 2: Rename the columns using the mappings + df.rename(columns=column_mappings, inplace=True) + + def store_element_text(element, tree, results_dict, xsd_files, nillable_elements_info, config, label_filename): """ @@ -762,6 +787,9 @@ def pad_column_values_and_headers(df): df = pd.DataFrame(rows) + if args.rename_headers: + replace_columns(args.rename_headers, df) + if args.sort_by: sort_values = str(args.sort_by).split(',') try: @@ -1176,6 +1204,15 @@ def main(cmd_line=None): 'contain characters permissible in variable ' 'names.') + index_file_generation.add_argument('--rename-headers', type=str, + metavar='NEW_HEADERS_FILEPATH', + help='Rename headers in the generated index file' + 'according to a given mapping file.') + + index_file_generation.add_argument('--dont-number-unique-tags', action='store_true', + help='Removes the predicates of unique XPath ' + 'headers.') + 
index_file_generation.add_argument( '--simplify-xpaths', action='store_true', @@ -1347,11 +1384,12 @@ def main(cmd_line=None): # the column refers to. At this stage, duplicate XPaths may exist again due to # the reformatting. These duplicates are corrected to preserve the contents of # each element's value. - xpath_map = renumber_xpaths(label_results) + correct_duplicates(label_results) + xpath_map = renumber_xpaths(label_results, args) for old_xpath, new_xpath in xpath_map.items(): label_results[new_xpath] = label_results.pop(old_xpath) - correct_duplicates(label_results) + # correct_duplicates(label_results) # Collect metadata about the label file. The label file's lid is scraped and # broken into multiple parts. This metadata can then be requested as additional From e114d3e48d26227df93c07eb7b61a7764f476e9a Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Mon, 19 Aug 2024 13:55:50 -0700 Subject: [PATCH 02/24] Updated code, got to 100% unit test coverage --- pds4indextools/index_label_template_pds.xml | 4 + pds4indextools/pds4_create_xml_index.py | 356 +++++++++-------- .../clean_header_field_names_success_2.csv | 2 + test_files/expected/index_file_success.csv | 2 + test_files/expected/label_success_1.csv | 2 + test_files/expected/label_success_1.xml | 116 ++++++ test_files/expected/label_success_2.csv | 2 + test_files/expected/label_success_2.xml | 115 ++++++ test_files/expected/label_success_3.csv | 4 + test_files/expected/label_success_3.xml | 102 +++++ .../expected/limit_xpaths_file_success_1.csv | 2 + .../expected/simplify_xpaths_success_1.txt | 14 +- .../expected/simplify_xpaths_success_3.txt | 62 +-- .../expected/simplify_xpaths_success_4.txt | 128 +++---- test_files/expected/tester_config.yaml | 4 +- test_files/expected/tester_config_label.yaml | 13 + .../expected/tester_config_nillable.yaml | 18 + test_files/labels/bad_lid_label.xml | 25 ++ .../samples/element_extra_file_info.txt | 4 + tests/test_pds4_create_xml_index_blackbox.py | 359 
++++++++++++++---- tests/test_pds4_create_xml_index_whitebox.py | 309 ++++++++++++++- 21 files changed, 1281 insertions(+), 362 deletions(-) create mode 100644 test_files/expected/clean_header_field_names_success_2.csv create mode 100644 test_files/expected/index_file_success.csv create mode 100644 test_files/expected/label_success_1.csv create mode 100644 test_files/expected/label_success_1.xml create mode 100644 test_files/expected/label_success_2.csv create mode 100644 test_files/expected/label_success_2.xml create mode 100644 test_files/expected/label_success_3.csv create mode 100644 test_files/expected/label_success_3.xml create mode 100644 test_files/expected/limit_xpaths_file_success_1.csv create mode 100644 test_files/expected/tester_config_label.yaml create mode 100644 test_files/expected/tester_config_nillable.yaml create mode 100644 test_files/labels/bad_lid_label.xml create mode 100644 test_files/samples/element_extra_file_info.txt diff --git a/pds4indextools/index_label_template_pds.xml b/pds4indextools/index_label_template_pds.xml index 8699cdc..89b6bb6 100644 --- a/pds4indextools/index_label_template_pds.xml +++ b/pds4indextools/index_label_template_pds.xml @@ -107,7 +107,11 @@ $END_IF $BASENAME(TEMPFILE)$ index-table + $IF(File) + $File['creation_date_time']$ + $ELSE $DATETIME(creation_date_time)$ + $END_IF $FILE_MD5(TEMPFILE)$ diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index 260632e..96d4d12 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -124,12 +124,9 @@ def correct_duplicates(label_results): cropped = tag.replace('_'+number, '') if cropped in element_names: if str(cropped+'_'+number+'<1>') in key: - key_new = key.replace((cropped+'_'+str((int(number)+1))+'<1>'), - cropped+'<1>') + key_new = key.replace((cropped+'_'+number+'<1>'), cropped+'<1>') else: - key_new = key.replace((cropped+'_'+str((int(number)+1))), - cropped+'<1>') - key_new = 
key.replace(('_' + number + '<1>'), '<1>') + key_new = key.replace(cropped+'_'+number, cropped+'<1>') parent = key_new.split('/')[-2].split('<')[0] key_new = key_new.replace(parent+'<1>', parent+'<'+str(int(number)+1)+'>') label_results[key_new] = label_results.pop(key) @@ -302,6 +299,33 @@ def filter_dict_by_glob_patterns(input_dict, glob_patterns, valid_add_extra_file return filtered_dict +def get_true_type(xsd_files, tag, namespaces): + def search_type(xsd_file, tag, namespaces): + print(f"Processing file: {xsd_file}") + xsd_tree = download_xsd_file(xsd_file) + namespaces = scrape_namespaces(xsd_tree) + true_type = find_base_attribute(xsd_tree, tag, namespaces) + if true_type: + print(f"Found true_type for tag '{tag}' in file: {xsd_file}") + return true_type + + # Check for modified tag if the first search does not find a match + modified_tag = tag + "_WO_Units" + true_type = find_base_attribute(xsd_tree, modified_tag, namespaces) + if true_type: + print(f"Found true_type for modified tag '{modified_tag}' in file: {xsd_file}") + return true_type # This will return either the found type or None + + for xsd_file in xsd_files: + true_type = search_type(xsd_file, tag, namespaces) + if true_type: # Only return if true_type is not None + print(f"Returning true_type found in file: {xsd_file}") + return true_type + + print("No true_type found in any file.") + return None # Return None if no match is found in any file + + def load_config_file( default_config_file=Path(__file__).resolve().parent/'default_config.yaml', specified_config_files=None): @@ -519,10 +543,7 @@ def split_xpath_prefix_and_num(s): # increasing starting at 1. We also add a special entry for the empty # suffix when there is no number. 
unique_nums = sorted({x.num for x in prefix_group_list if x.num is not None}) - if args.dont_number_unique_tags and len(unique_nums) == 1: - renumber_map = {x: '' for x in unique_nums} - else: - renumber_map = {x: f'<{i+1}>' for i, x in enumerate(unique_nums)} + renumber_map = {x: f'<{i+1}>' for i, x in enumerate(unique_nums)} renumber_map[None] = '' # We further group these by unique parent (including the number) @@ -580,20 +601,6 @@ def split_into_elements(xpath): return elements -def replace_columns(filepath, df): - # Create an empty dictionary to store column mappings - column_mappings = {} - - # Read the file and populate the dictionary - with open(filepath, 'r') as file: - for line in file: - old_name, new_name = line.strip().split(', ') - column_mappings[old_name] = new_name - - # Step 2: Rename the columns using the mappings - df.rename(columns=column_mappings, inplace=True) - - def store_element_text(element, tree, results_dict, xsd_files, nillable_elements_info, config, label_filename): """ @@ -630,17 +637,12 @@ def store_element_text(element, tree, results_dict, xsd_files, nillable_elements xsd_tree = download_xsd_file(xsd_file) namespaces = scrape_namespaces(xsd_tree) true_type = find_base_attribute(xsd_tree, tag, namespaces) - if true_type: - break # Exit the loop once true_type is found - - if not true_type: - modified_tag = tag + "_WO_Units" - for xsd_file in xsd_files: - namespaces = scrape_namespaces(xsd_tree) + if not true_type: + modified_tag = tag + "_WO_Units" true_type = find_base_attribute(xsd_tree, modified_tag, namespaces) - if true_type: - break + # if true_type: + # break default = default_value_for_nil(config, true_type, nil_value) results_dict[xpath] = default @@ -704,40 +706,39 @@ def update_nillable_elements_from_xsd_file(xsd_file, nillable_elements_info): for element in elements_with_nillable: name = element.get('name') type_attribute = element.get('type') - if type_attribute not in nillable_elements_info: - if type_attribute: - # 
Split the type attribute to handle namespace:typename format - type_parts = type_attribute.split(':') - # Take the last part as the type name - type_name = type_parts[-1] - - # Attempt to find the type definition in the document - type_definition_xpath = (f'//xs:simpleType[@name="{type_name}"] | ' - f'//xs:complexType[@name="{type_name}"]') - type_definition = tree.xpath( - type_definition_xpath, namespaces=namespace) - - if type_definition: - # Take the first match - type_definition = type_definition[0] - base_type = None - # For complexType with simpleContent or simpleType, find base attr - if type_definition.tag.endswith('simpleType'): - restriction = type_definition.find('.//xs:restriction', - namespaces=namespace) - if restriction is not None: - base_type = restriction.get('base') - elif type_definition.tag.endswith('complexType'): - extension = type_definition.find('.//xs:extension', - namespaces=namespace) - if extension is not None: - base_type = extension.get('base') - - nillable_elements_info[name] = ( - base_type or 'External or built-in type') - else: - # Type definition not found, might be external or built-in type - nillable_elements_info[name] = 'External or built-in type' + if type_attribute: + # Split the type attribute to handle namespace:typename format + type_parts = type_attribute.split(':') + # Take the last part as the type name + type_name = type_parts[-1] + + # Attempt to find the type definition in the document + type_definition_xpath = (f'//xs:simpleType[@name="{type_name}"] | ' + f'//xs:complexType[@name="{type_name}"]') + type_definition = tree.xpath( + type_definition_xpath, namespaces=namespace) + + if type_definition: + # Take the first match + type_definition = type_definition[0] + base_type = None + # For complexType with simpleContent or simpleType, find base attr + + try: + restriction = type_definition.find('.//xs:restriction', + namespaces=namespace) + base_type = restriction.get('base') + + except AttributeError: + extension = 
type_definition.find('.//xs:extension', + namespaces=namespace) + base_type = extension.get('base') + + nillable_elements_info[name] = ( + base_type or 'External or built-in type') + else: + # Type definition not found, might be external or built-in type + nillable_elements_info[name] = 'External or built-in type' def write_results_to_csv(results_list, args, output_csv_path): @@ -781,24 +782,22 @@ def pad_column_values_and_headers(df): return padded_df + rows = [] for result_dict in results_list: - rows.append(result_dict['Results']) + rows.append(result_dict) df = pd.DataFrame(rows) - if args.rename_headers: - replace_columns(args.rename_headers, df) - if args.sort_by: sort_values = str(args.sort_by).split(',') try: - df.sort_values(by=sort_values, inplace=True) - except KeyError as bad_sort: - print(f'Unknown sort key {bad_sort}. For a list of available sort keys, use ' - f'the --output-headers-file option.') + sort_dataframe(df, sort_values) + except ValueError as bad_sort: + print(bad_sort) sys.exit(1) + if args.clean_header_field_names: clean_headers(df) @@ -838,32 +837,6 @@ def find_base_attribute(xsd_tree, target_name, new_namespaces): } namespaces.update(new_namespaces) - def follow_base_type(base_type): - """ - Recursively follows the base type definitions to find the final base type. - - Parameters: - base_type (str): The initial base type to follow. - - Returns: - str: The final base type. - """ - while True: - if 'ASCII' in base_type or 'UTF8' in base_type: - return base_type - - next_query = ( - f".//xs:simpleType[@name='{base_type.split(':')[-1]}']" - f"//xs:restriction/@base" - ) - try: - next_result = xsd_tree.xpath(next_query, namespaces=namespaces) - except etree.XPathEvalError: - break - if not next_result: - break - base_type = next_result[0] - return base_type def get_base_type(query): """ @@ -875,11 +848,8 @@ def get_base_type(query): Returns: list: The result of the XPath query. 
""" - try: - result = xsd_tree.xpath(query, namespaces=namespaces) - return result - except etree.XPathEvalError: - return None + result = xsd_tree.xpath(query, namespaces=namespaces) + return result queries = [ f".//xs:complexType[@name='{target_name}']//xs:extension/@base", @@ -933,7 +903,7 @@ def get_base_type(query): result = get_base_type(query) if result: base_type = result[0] - return follow_base_type(base_type) + return base_type return None @@ -955,7 +925,15 @@ def scrape_namespaces(tree): return namespaces -def get_creation_date(file_path): +def sort_dataframe(df, sort_keys): + try: + df.sort_values(by=sort_keys, inplace=True) + except KeyError as bad_sort: + raise ValueError(f'Unknown sort key {bad_sort}. For a list of available sort ' + f'keys, use the --output-headers-file option.') + + +def get_creation_date(file_path): """ Returns the creation date of a file in ISO 8601 format. @@ -973,7 +951,7 @@ def get_creation_date(file_path): stat = os.stat(file_path) try: creation_time = stat.st_birthtime - except AttributeError: + except AttributeError: # pragma: no coverage # Fallback to the last modification time if birth time is not available creation_time = stat.st_mtime @@ -1146,7 +1124,7 @@ def _fill_text(self, text, width, indent): def main(cmd_line=None): epilog_sfx = '' - if __version__ != 'Version unspecified': + if __version__ != 'Version unspecified': # pragma: no coverage epilog_sfx = f'|nVersion: {__version__}' parser = argparse.ArgumentParser( formatter_class=MultilineFormatter, @@ -1290,10 +1268,19 @@ def main(cmd_line=None): for pattern in patterns: files = directory_path.glob(pattern) - if not files: - verboseprint(f'No files matching {pattern} found in ' - f'directory: {directory_path}') - label_files.extend(files) + + # Create an iterator from the generator + files_iter = iter(files) + + # Use a sentinel object to check if there's any item + sentinel = object() + first_file = next(files_iter, sentinel) + + if first_file is sentinel: + 
print(f"No files found for pattern: {pattern}") + else: + # If not empty, continue processing and include the first file + label_files.extend(itertools.chain([first_file], files_iter)) verboseprint(f'{len(label_files)} matching file(s) found') @@ -1337,7 +1324,7 @@ def main(cmd_line=None): filepath = str(label_file.relative_to(args.directorypath)).replace('\\', '/') # PDS4 compliant filepaths must be less than 255 characters. - if len(filepath) > 255: + if len(filepath) > 255: # pragma: no coverage print(f'Filepath {filepath} exceeds 255 character limit.') sys.exit(1) @@ -1351,11 +1338,11 @@ def main(cmd_line=None): label_results = {} traverse_and_store(root, tree, label_results, xsd_files, nillable_elements_info, config, label_file) - - # The XPath headers in the label_results dictionary are reformatted to - # improve readability. Each XPath's namespace is replaced with its prefix for - # faster reference. Duplicate XPaths are made unique to ensure all results are - # present in the final product. + + # # The XPath headers in the label_results dictionary are reformatted to + # # improve readability. Each XPath's namespace is replaced with its prefix for + # # faster reference. Duplicate XPaths are made unique to ensure all results are + # # present in the final product. for key in list(label_results): process_headers(label_results, key, root, namespaces, prefixes) @@ -1384,19 +1371,21 @@ def main(cmd_line=None): # the column refers to. At this stage, duplicate XPaths may exist again due to # the reformatting. These duplicates are corrected to preserve the contents of # each element's value. - correct_duplicates(label_results) xpath_map = renumber_xpaths(label_results, args) for old_xpath, new_xpath in xpath_map.items(): label_results[new_xpath] = label_results.pop(old_xpath) - # correct_duplicates(label_results) + correct_duplicates(label_results) # Collect metadata about the label file. The label file's lid is scraped and # broken into multiple parts. 
This metadata can then be requested as additional # columns within the index file. - lid = extract_logical_identifier(tree) - if lid is None: - lid = label_results.get('pds:logical_identifier', 'Missing_LID') + try: + lid = extract_logical_identifier(tree) + except AttributeError: + print(f"Label file {label_file} does not have a " + f"logical_identifier attribute.") + sys.exit(1) # Attach extra columns if asked for. bundle_lid = ':'.join(lid.split(':')[:4]) @@ -1409,8 +1398,8 @@ def main(cmd_line=None): label_results = {**{ele: extras[ele] for ele in args.add_extra_file_info}, **label_results} - result_dict = {'Results': label_results} - all_results.append(result_dict) + all_results.append(label_results) + if args.add_extra_file_info and elements_to_scrape is not None: elements_to_scrape = args.add_extra_file_info + elements_to_scrape @@ -1419,57 +1408,69 @@ def main(cmd_line=None): # of the --limit-xpaths-file input file. If this command is not used, the original # dictionary will be returned. Glob patterns are processed sequentially, with the # first pattern having the highest priority. - for i in range(len(all_results)): - label_results = all_results[i]['Results'] - label_results = filter_dict_by_glob_patterns( + + for label_results in all_results: + ind = all_results.index(label_results) + label_results_new = filter_dict_by_glob_patterns( label_results, elements_to_scrape, valid_add_extra_file_info, verboseprint) - all_results[i]['Results'] = label_results + all_results[ind] = label_results_new - if all(len(set(r['Results'])) == 0 for r in all_results): + if all(len(set(r)) == 0 for r in all_results): print('No results found: glob pattern(s) excluded all matches.') sys.exit(1) - # If --simplify-xpaths is used, the XPath headers will be shortened to the - # element's tag and namespace prefix. 
This is contingent on the uniqueness of - # the XPath header; if more than one XPath header shares a tag, a namespace and a - # predicate value, the XPath header will remain whole. + # # If --simplify-xpaths is used, the XPath headers will be shortened to the + # # element's tag and namespace prefix. This is contingent on the uniqueness of + # # the XPath header; if more than one XPath header shares a tag, a namespace and a + # # predicate value, the XPath header will remain whole. if args.simplify_xpaths: - for i in range(len(all_results)): - label_results = all_results[i]['Results'] + headers = {} + unique_tags_master = [] + + # Step 1: Gather all possible tags from labels + for label_results in all_results: + keys = label_results.keys() + for key in keys: + tag = key.split('/')[-1] + tags.append(tag) + if key not in headers: + headers[key] = tag + + # For each label, collect all tags that only occur once. If a unique tag occurs + # multiple times within a label, that tag will be removed from the collective + # list of unique tags. 
+ for label_results in all_results: tags = [] + unique_tags = [] names = [] - - # Step 1: Gather all tags from keys - for key in label_results: - elements = key.split('/') - tag = elements[-1] - name = tag.split('<')[0] + for key in keys: + tag = key.split('/')[-1] tags.append(tag) + name = tag.split('<')[0] names.append(name) - - # Step 2: Find unique tags - unique_tags = [] for tag in tags: name = tag.split('<')[0] - if tags.count(tag) == 1 and names.count(name) == 1: + if (tags.count(tag) == 1 and names.count(name) == 1 + and tag not in unique_tags): unique_tags.append(tag) - - # Step 3: Create a new dictionary to hold modified results + # if tags.count(tag) > 1 and tag in unique_tags_master: + # unique_tags_master.remove(tag) + + for tag in unique_tags: + unique_tags_master.append(tag) + + for label_results in all_results: + ind = all_results.index(label_results) new_label_results = {} - - # Step 4: Iterate over original dictionary to modify and copy to new - # dictionary for key, value in list(label_results.items()): - elements = key.split('/') - tag = elements[-1] - if tag in unique_tags: - new_tag = tag.split('<')[0] - verboseprint(f'XPath header {key} changed to {new_tag}') - new_label_results[new_tag] = value + new_key = headers[key] + if key.split('/')[-1] in unique_tags_master: + new_label_results[new_key] = value else: new_label_results[key] = value - all_results[i]['Results'] = new_label_results + all_results[ind] = new_label_results + if output_csv_path: write_results_to_csv(all_results, args, output_csv_path) @@ -1479,12 +1480,14 @@ def main(cmd_line=None): # the label_results dictionary and place them in the output file, instead of the # index file. 
if output_txt_path: + if not args.output_index_file: + print('No index file generated because --output-headers-file was ' + 'provided without --output-index-file.') xpaths = [] for label in all_results: - for values in label.values(): - for xpath in values: - if xpath not in xpaths: - xpaths.append(xpath) + for xpath in label: + if xpath not in xpaths: + xpaths.append(xpath) # The file is now written and placed in a given location. If cleaned header # field names are requested, they are processed here before being written in. @@ -1497,14 +1500,12 @@ def main(cmd_line=None): ':', '_').replace('/', '__').replace('<', '_').replace('>', '') output_fp.write("%s\n" % item) print(f'XPath headers file generated at {output_txt_path}.') - if not args.output_index_file: - print('No index file generated because --output-headers-file was ' - 'provided without --output-index-file.') # Generates the label for this index file, if --generate-label is used. if args.generate_label: index_file = output_csv_path + print(index_file) # The template label file is initialized. 
module_dir = Path(__file__).resolve().parent @@ -1550,7 +1551,7 @@ def main(cmd_line=None): true_type = 'pds:ASCII_LID' elif header == 'filename': true_type = 'pds:ASCII_File_Name' - elif header == filepath: + elif header == 'filepath': true_type = 'pds:ASCII_File_Specification_Name' elif header == 'bundle': true_type = 'pds:ASCII_Text_Preserved' @@ -1558,25 +1559,8 @@ def main(cmd_line=None): parts = header.split('/') name = parts[-1].split('<')[0].split(':')[-1] - true_type = None + true_type = get_true_type(xsd_files, name, namespaces) - for xsd_file in xsd_files: - xsd_tree = download_xsd_file(xsd_file) - true_type = find_base_attribute(xsd_tree, name, namespaces) - if true_type: - break - - if not true_type: - modified_name = name + "_WO_Units" - for xsd_file in xsd_files: - xsd_tree = download_xsd_file(xsd_file) - true_type = find_base_attribute(xsd_tree, modified_name, - namespaces) - if true_type: - break - - if true_type is None: - true_type = ':inapplicable' true_type = true_type.split(':')[-1] field_number += 1 header_length = len(header.encode('utf-8')) @@ -1635,5 +1619,5 @@ def main(cmd_line=None): template.write(label_content, str(output_subdir / filename) + '.xml') -if __name__ == '__main__': +if __name__ == '__main__': # pragma: no coverage main() diff --git a/test_files/expected/clean_header_field_names_success_2.csv b/test_files/expected/clean_header_field_names_success_2.csv new file mode 100644 index 0000000..6304953 --- /dev/null +++ b/test_files/expected/clean_header_field_names_success_2.csv @@ -0,0 +1,2 @@ 
+pds_Product_Observational__pds_Identification_Area_1__pds_logical_identifier_1,pds_Product_Observational__pds_Identification_Area_1__pds_version_id_1,pds_Product_Observational__pds_Identification_Area_1__pds_title_1,pds_Product_Observational__pds_Identification_Area_1__pds_information_model_version_1,pds_Product_Observational__pds_Observing_System_1__pds_name_1,pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_name_1,pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_type_1,pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_Internal_Reference_1__pds_lid_reference_1,pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_Internal_Reference_1__pds_reference_type_1 +urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n,1.0,Cassini ISS Image 1455200455n.img,1.11.0.0,Cassini Orbiter Imaging Science Subsystem,Cassini Orbiter,Spacecraft,urn:nasa:pds:context:instrument_host:spacecraft.co,is_instrument_host diff --git a/test_files/expected/index_file_success.csv b/test_files/expected/index_file_success.csv new file mode 100644 index 0000000..90ebe6e --- /dev/null +++ b/test_files/expected/index_file_success.csv @@ -0,0 +1,2 @@ 
+pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:title<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:information_model_version<1>,pds:Product_Observational/pds:Observing_System<1>/pds:name<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:name<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:type<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:lid_reference<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:reference_type<1> +urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n,1.0,Cassini ISS Image 1455200455n.img,1.11.0.0,Cassini Orbiter Imaging Science Subsystem,Cassini Orbiter,Spacecraft,urn:nasa:pds:context:instrument_host:spacecraft.co,is_instrument_host diff --git a/test_files/expected/label_success_1.csv b/test_files/expected/label_success_1.csv new file mode 100644 index 0000000..90ebe6e --- /dev/null +++ b/test_files/expected/label_success_1.csv @@ -0,0 +1,2 @@ 
+pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:title<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:information_model_version<1>,pds:Product_Observational/pds:Observing_System<1>/pds:name<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:name<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:type<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:lid_reference<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:reference_type<1> +urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n,1.0,Cassini ISS Image 1455200455n.img,1.11.0.0,Cassini Orbiter Imaging Science Subsystem,Cassini Orbiter,Spacecraft,urn:nasa:pds:context:instrument_host:spacecraft.co,is_instrument_host diff --git a/test_files/expected/label_success_1.xml b/test_files/expected/label_success_1.xml new file mode 100644 index 0000000..6d4945f --- /dev/null +++ b/test_files/expected/label_success_1.xml @@ -0,0 +1,116 @@ + + + + + + urn:nasa:pds:rms_metadata:document_opus:generated_label_1 + 1.1 + Index File + 1.21.0.0 + Product_Ancillary + + Creative Common Public License CC0 1.0 (2024) + Creative Commons Zero (CC0) license information. + + urn:nasa:pds:system_bundle:document_pds4_standards:creative_commons_1.0.0::1.0 + product_to_license + + + + + + + + generated_label_1.csv + index-table + 00:00:00 + a177a1160bf3780c01e3bd9e02be89f4 + + +
+ 0 + 819 + UTF-8 Text + Provides the column headers, separated by commas, for the data table. +
+ + 0 + 1058 + PDS DSV 1 + 2 + Line-Feed + Comma + + 9 + 0 + 818 + + pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1> + 1 + ASCII_Short_String_Collapsed + 52 + + + + pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1> + 2 + ASCII_Short_String_Collapsed + 3 + + + + pds:Product_Observational/pds:Identification_Area<1>/pds:title<1> + 3 + ASCII_Short_String_Collapsed + 33 + + + + pds:Product_Observational/pds:Identification_Area<1>/pds:information_model_version<1> + 4 + ASCII_Short_String_Collapsed + 8 + + + + pds:Product_Observational/pds:Observing_System<1>/pds:name<1> + 5 + UTF8_Short_String_Collapsed + 41 + + + + pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:name<1> + 6 + UTF8_Short_String_Collapsed + 15 + + + + pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:type<1> + 7 + ASCII_Short_String_Collapsed + 10 + + + + pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:lid_reference<1> + 8 + ASCII_LID + 50 + + + + pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:reference_type<1> + 9 + ASCII_Short_String_Collapsed + 18 + + + + +
+
diff --git a/test_files/expected/label_success_2.csv b/test_files/expected/label_success_2.csv new file mode 100644 index 0000000..d9f2dc4 --- /dev/null +++ b/test_files/expected/label_success_2.csv @@ -0,0 +1,2 @@ +pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:title<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:information_model_version<1>,pds:Product_Observational/pds:Observing_System<1>/pds:name<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:name<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:type<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:lid_reference<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:reference_type<1> +urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n ,1.0 ,Cassini ISS Image 1455200455n.img ,1.11.0.0 ,Cassini Orbiter Imaging Science Subsystem ,Cassini Orbiter ,Spacecraft ,urn:nasa:pds:context:instrument_host:spacecraft.co ,is_instrument_host diff --git a/test_files/expected/label_success_2.xml b/test_files/expected/label_success_2.xml new file mode 100644 index 0000000..b5bed6e --- /dev/null +++ b/test_files/expected/label_success_2.xml @@ -0,0 +1,115 @@ + + + + + + urn:nasa:pds:rms_metadata:document_opus:generated_label_2 + 1.1 + Index File + 1.21.0.0 + Product_Ancillary + + Creative Common Public License CC0 1.0 (2024) + Creative Commons Zero (CC0) license information. + + urn:nasa:pds:system_bundle:document_pds4_standards:creative_commons_1.0.0::1.0 + product_to_license + + + + + + + + generated_label_2.csv + index-table + 00:00:00 + 53d47b320936ac3fbba0852696065418 + + +
+ 0 + 819 + UTF-8 Text + Provides the column headers, separated by commas, for the data table. +
+ + + 1638 + 2 + Line-Feed + + + 9 + 0 + + + pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1> + 1 + 1 + ASCII_Short_String_Collapsed + 78 + + + pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1> + 2 + 79 + ASCII_Short_String_Collapsed + 70 + + + pds:Product_Observational/pds:Identification_Area<1>/pds:title<1> + 3 + 150 + ASCII_Short_String_Collapsed + 65 + + + pds:Product_Observational/pds:Identification_Area<1>/pds:information_model_version<1> + 4 + 216 + ASCII_Short_String_Collapsed + 85 + + + pds:Product_Observational/pds:Observing_System<1>/pds:name<1> + 5 + 302 + UTF8_Short_String_Collapsed + 61 + + + pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:name<1> + 6 + 364 + UTF8_Short_String_Collapsed + 95 + + + pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:type<1> + 7 + 460 + ASCII_Short_String_Collapsed + 95 + + + pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:lid_reference<1> + 8 + 556 + ASCII_LID + 130 + + + pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:reference_type<1> + 9 + 687 + ASCII_Short_String_Collapsed + 131 + + + +
+
diff --git a/test_files/expected/label_success_3.csv b/test_files/expected/label_success_3.csv new file mode 100644 index 0000000..188177b --- /dev/null +++ b/test_files/expected/label_success_3.csv @@ -0,0 +1,4 @@ +filename,filepath,lid,bundle,bundle_lid,pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1> +tester_label_1.xml,labels/tester_label_1.xml,urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n,cassini_iss_saturn,urn:nasa:pds:cassini_iss_saturn,urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n,1.0 +tester_label_2.xml,labels/tester_label_2.xml,urn:nasa:pds:uranus_occ_u149_irtf_320cm:data:2200nm_counts-v-time_occult,uranus_occ_u149_irtf_320cm,urn:nasa:pds:uranus_occ_u149_irtf_320cm,urn:nasa:pds:uranus_occ_u149_irtf_320cm:data:2200nm_counts-v-time_occult,1.0 +tester_label_3.xml,labels/tester_label_3.xml,urn:nasa:pds:cassini_iss_cruise:data_raw:1357539630n,cassini_iss_cruise,urn:nasa:pds:cassini_iss_cruise,urn:nasa:pds:cassini_iss_cruise:data_raw:1357539630n,1.0 diff --git a/test_files/expected/label_success_3.xml b/test_files/expected/label_success_3.xml new file mode 100644 index 0000000..96cc903 --- /dev/null +++ b/test_files/expected/label_success_3.xml @@ -0,0 +1,102 @@ + + + + + + urn:nasa:pds:rms_metadata:document_opus:generated_label_3 + 1.1 + Index File + 1.21.0.0 + Product_Ancillary + + Creative Common Public License CC0 1.0 (2024) + Creative Commons Zero (CC0) license information. + + urn:nasa:pds:system_bundle:document_pds4_standards:creative_commons_1.0.0::1.0 + product_to_license + + + + + + + + generated_label_3.csv + index-table + 00:00:00 + 8b2eb69a284938d23748de7f53d2e45b + + +
+ 0 + 190 + UTF-8 Text + Provides the column headers, separated by commas, for the data table. +
+ + 0 + 864 + PDS DSV 1 + 4 + Line-Feed + Comma + + 7 + 0 + 261 + + filename + 1 + ASCII_File_Name + 18 + + + + filepath + 2 + ASCII_File_Specification_Name + 25 + + + + lid + 3 + ASCII_LID + 72 + + + + bundle + 4 + ASCII_Text_Preserved + 26 + + + + bundle_lid + 5 + ASCII_LID + 39 + + + + pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1> + 6 + ASCII_Short_String_Collapsed + 72 + + + + pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1> + 7 + ASCII_Short_String_Collapsed + 3 + + + + +
+
diff --git a/test_files/expected/limit_xpaths_file_success_1.csv b/test_files/expected/limit_xpaths_file_success_1.csv new file mode 100644 index 0000000..08b5633 --- /dev/null +++ b/test_files/expected/limit_xpaths_file_success_1.csv @@ -0,0 +1,2 @@ +pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1>,pds:Product_Observational/pds:Identification_Area<1>/pds:title<1> +urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n,1.0,Cassini ISS Image 1455200455n.img diff --git a/test_files/expected/simplify_xpaths_success_1.txt b/test_files/expected/simplify_xpaths_success_1.txt index e179908..d39804c 100644 --- a/test_files/expected/simplify_xpaths_success_1.txt +++ b/test_files/expected/simplify_xpaths_success_1.txt @@ -1,9 +1,9 @@ -pds:logical_identifier -pds:version_id -pds:title -pds:information_model_version +pds:logical_identifier<1> +pds:version_id<1> +pds:title<1> +pds:information_model_version<1> pds:Product_Observational/pds:Observing_System<1>/pds:name<1> pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:name<1> -pds:type -pds:lid_reference -pds:reference_type +pds:type<1> +pds:lid_reference<1> +pds:reference_type<1> diff --git a/test_files/expected/simplify_xpaths_success_3.txt b/test_files/expected/simplify_xpaths_success_3.txt index 08fc585..0ac5ded 100644 --- a/test_files/expected/simplify_xpaths_success_3.txt +++ b/test_files/expected/simplify_xpaths_success_3.txt @@ -1,31 +1,31 @@ -rings:occultation_type -rings:occultation_direction -rings:time_series_direction -rings:planetary_occultation_flag -rings:data_quality_score -rings:ring_plane -rings:star_name -rings:fresnel_scale -rings:projected_star_diameter -rings:sigma_projected_star_diameter -rings:fractional_error_star_counts -rings:time_constant_type -rings:time_constant -rings:sigma_time_constant -rings:minimum_wavelength -rings:maximum_wavelength 
-rings:reference_time_utc -rings:minimum_observed_event_time -rings:maximum_observed_event_time -rings:observed_event_start_tdb -rings:observed_event_stop_tdb -rings:earth_received_start_time_utc -rings:earth_received_stop_time_utc -rings:minimum_ring_radius -rings:maximum_ring_radius -rings:minimum_ring_longitude -rings:maximum_ring_longitude -rings:minimum_observed_ring_azimuth -rings:maximum_observed_ring_azimuth -rings:observed_ring_elevation -rings:light_source_incidence_angle +rings:occultation_type<1> +rings:occultation_direction<1> +rings:time_series_direction<1> +rings:planetary_occultation_flag<1> +rings:data_quality_score<1> +rings:ring_plane<1> +rings:star_name<1> +rings:fresnel_scale<1> +rings:projected_star_diameter<1> +rings:sigma_projected_star_diameter<1> +rings:fractional_error_star_counts<1> +rings:time_constant_type<1> +rings:time_constant<1> +rings:sigma_time_constant<1> +rings:minimum_wavelength<1> +rings:maximum_wavelength<1> +rings:reference_time_utc<1> +rings:minimum_observed_event_time<1> +rings:maximum_observed_event_time<1> +rings:observed_event_start_tdb<1> +rings:observed_event_stop_tdb<1> +rings:earth_received_start_time_utc<1> +rings:earth_received_stop_time_utc<1> +rings:minimum_ring_radius<1> +rings:maximum_ring_radius<1> +rings:minimum_ring_longitude<1> +rings:maximum_ring_longitude<1> +rings:minimum_observed_ring_azimuth<1> +rings:maximum_observed_ring_azimuth<1> +rings:observed_ring_elevation<1> +rings:light_source_incidence_angle<1> diff --git a/test_files/expected/simplify_xpaths_success_4.txt b/test_files/expected/simplify_xpaths_success_4.txt index a1d7d69..9422e08 100644 --- a/test_files/expected/simplify_xpaths_success_4.txt +++ b/test_files/expected/simplify_xpaths_success_4.txt @@ -1,64 +1,64 @@ -cassini:mission_phase_name -cassini:spacecraft_clock_count_partition -cassini:spacecraft_clock_start_count -cassini:spacecraft_clock_stop_count -cassini:limitations -cassini:antiblooming_state_flag -cassini:command_file_name 
-cassini:command_sequence_number -cassini:dark_strip_mean -cassini:data_conversion_type -cassini:delayed_readout_flag -cassini:detector_temperature -cassini:electronics_bias -cassini:expected_maximum_full_well -cassini:expected_maximum_DN_sat -cassini:expected_packets -cassini:exposure_duration -cassini:filter_name_1 -cassini:filter_name_2 -cassini:filter_temperature -cassini:flight_software_version_id -cassini:gain_mode_id -cassini:ground_software_version_id -cassini:image_mid_time -cassini:image_number -cassini:image_time -cassini:image_observation_type -cassini:instrument_data_rate -cassini:inst_cmprs_type -cassini:inst_cmprs_param_malgo -cassini:inst_cmprs_param_tb -cassini:inst_cmprs_param_blocks -cassini:inst_cmprs_param_quant -cassini:inst_cmprs_rate_expected_bits -cassini:inst_cmprs_rate_actual_bits -cassini:inst_cmprs_ratio -cassini:light_flood_state_flag -cassini:method_description -cassini:missing_lines -cassini:missing_packet_flag -cassini:optics_temperature_front -cassini:optics_temperature_back -cassini:order_number -cassini:parallel_clock_voltage_index -cassini:pds3_product_creation_time -cassini:pds3_product_version_type -cassini:pds3_target_desc -cassini:pds3_target_list -cassini:pds3_target_name -cassini:pre-pds_version_number -cassini:prepare_cycle_index -cassini:readout_cycle_index -cassini:received_packets -cassini:sensor_head_electronics_temperature -cassini:sequence_id -cassini:sequence_number -cassini:sequence_title -cassini:shutter_mode_id -cassini:shutter_state_id -cassini:start_time_doy -cassini:stop_time_doy -cassini:telemetry_format_id -cassini:valid_maximum_full_well -cassini:valid_maximum_DN_sat +cassini:mission_phase_name<1> +cassini:spacecraft_clock_count_partition<1> +cassini:spacecraft_clock_start_count<1> +cassini:spacecraft_clock_stop_count<1> +cassini:limitations<1> +cassini:antiblooming_state_flag<1> +cassini:command_file_name<1> +cassini:command_sequence_number<1> +cassini:dark_strip_mean<1> +cassini:data_conversion_type<1> 
+cassini:delayed_readout_flag<1> +cassini:detector_temperature<1> +cassini:electronics_bias<1> +cassini:expected_maximum_full_well<1> +cassini:expected_maximum_DN_sat<1> +cassini:expected_packets<1> +cassini:exposure_duration<1> +cassini:filter_name_1<1> +cassini:filter_name_2<1> +cassini:filter_temperature<1> +cassini:flight_software_version_id<1> +cassini:gain_mode_id<1> +cassini:ground_software_version_id<1> +cassini:image_mid_time<1> +cassini:image_number<1> +cassini:image_time<1> +cassini:image_observation_type<1> +cassini:instrument_data_rate<1> +cassini:inst_cmprs_type<1> +cassini:inst_cmprs_param_malgo<1> +cassini:inst_cmprs_param_tb<1> +cassini:inst_cmprs_param_blocks<1> +cassini:inst_cmprs_param_quant<1> +cassini:inst_cmprs_rate_expected_bits<1> +cassini:inst_cmprs_rate_actual_bits<1> +cassini:inst_cmprs_ratio<1> +cassini:light_flood_state_flag<1> +cassini:method_description<1> +cassini:missing_lines<1> +cassini:missing_packet_flag<1> +cassini:optics_temperature_front<1> +cassini:optics_temperature_back<1> +cassini:order_number<1> +cassini:parallel_clock_voltage_index<1> +cassini:pds3_product_creation_time<1> +cassini:pds3_product_version_type<1> +cassini:pds3_target_desc<1> +cassini:pds3_target_list<1> +cassini:pds3_target_name<1> +cassini:pre-pds_version_number<1> +cassini:prepare_cycle_index<1> +cassini:readout_cycle_index<1> +cassini:received_packets<1> +cassini:sensor_head_electronics_temperature<1> +cassini:sequence_id<1> +cassini:sequence_number<1> +cassini:sequence_title<1> +cassini:shutter_mode_id<1> +cassini:shutter_state_id<1> +cassini:start_time_doy<1> +cassini:stop_time_doy<1> +cassini:telemetry_format_id<1> +cassini:valid_maximum_full_well<1> +cassini:valid_maximum_DN_sat<1> diff --git a/test_files/expected/tester_config.yaml b/test_files/expected/tester_config.yaml index 3ff9bc6..35d3d8c 100644 --- a/test_files/expected/tester_config.yaml +++ b/test_files/expected/tester_config.yaml @@ -18,4 +18,6 @@ nillable: anticipated: anticipated_alt 
label-contents: - version_id: 1.1 \ No newline at end of file + version_id: 1.1 + File: + creation_date_time: '00:00:00' diff --git a/test_files/expected/tester_config_label.yaml b/test_files/expected/tester_config_label.yaml new file mode 100644 index 0000000..ada75dc --- /dev/null +++ b/test_files/expected/tester_config_label.yaml @@ -0,0 +1,13 @@ + +label-contents: + title: Index file for my occultation bundle + Modification_Detail: + - modification_date: '2024-01-01' + version_id: 1.1 + description: | + This is a lengthy description of what this modification + changed in the bundle. + There were lots of changes. + - modification_date: '2023-01-01' + version_id: 1.0 + description: Initial release. diff --git a/test_files/expected/tester_config_nillable.yaml b/test_files/expected/tester_config_nillable.yaml new file mode 100644 index 0000000..4be242b --- /dev/null +++ b/test_files/expected/tester_config_nillable.yaml @@ -0,0 +1,18 @@ +nillable: + pds:ASCII_Integer: + inapplicable: -9999 + missing: -9988 + unknown: -9977 + anticipated: -9966 + + pds:ASCII_Real: + inapplicable: -9999.0 + missing: -9988.0 + unknown: -9977.0 + anticipated: -9966.0 + + pds:ASCII_Short_String_Collapsed: + inapplicable: inapplicable_alt + missing: missing_alt + unknown: unknown_alt + anticipated: anticipated_alt \ No newline at end of file diff --git a/test_files/labels/bad_lid_label.xml b/test_files/labels/bad_lid_label.xml new file mode 100644 index 0000000..b6847a7 --- /dev/null +++ b/test_files/labels/bad_lid_label.xml @@ -0,0 +1,25 @@ + + + + 1.0 + Cassini ISS Image 1455200455n.img + 1.11.0.0 + + + Cassini Orbiter Imaging Science Subsystem + + Cassini Orbiter + Spacecraft + + urn:nasa:pds:context:instrument_host:spacecraft.co + is_instrument_host + + + + diff --git a/test_files/samples/element_extra_file_info.txt b/test_files/samples/element_extra_file_info.txt new file mode 100644 index 0000000..731d690 --- /dev/null +++ b/test_files/samples/element_extra_file_info.txt @@ -0,0 
+1,4 @@ +pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1> +pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1> +pds:Product_Observational/pds:Identification_Area<1>/pds:title<1> +!filename \ No newline at end of file diff --git a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index 26e772f..f71e7ed 100644 --- a/tests/test_pds4_create_xml_index_blackbox.py +++ b/tests/test_pds4_create_xml_index_blackbox.py @@ -2,6 +2,7 @@ import pytest import os import tempfile +import shutil import pds4indextools.pds4_create_xml_index as tools @@ -15,21 +16,67 @@ @pytest.mark.parametrize( - 'golden_file,new_file,cmd_line', + 'golden_file,new_file_index,new_file_headers,cmd_line', [ + #Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" + ( + str(expected_dir / 'index_file_success.csv'), + None, None, + [] + ), + + #Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary + ( + str(expected_dir / 'index_file_success.csv'), + None, None, + [ + '--generate-label', + 'ancillary' + ] + ), + + # Testing --limit-xpaths-file with two outputs + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --output-headers-file limit_xpaths_file.txt --output-index-file limit_xpaths_file.csv + # Compare result to golden copy: + # test_files/expected/limit_xpaths_file_success_1.txt + ( + str(expected_dir / 'limit_xpaths_file_success_1.csv'), + 'limit_xpaths_file.csv', 'limit_xpaths_file.txt', + [ + str(test_files_dir), + str(labels_dir.name / Path('tester_label_1.xml')), + '--limit-xpaths-file', + str(samples_dir / 'element_1.txt') + ] + ), + # Testing --limit-xpaths-file # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --output-headers-file 
limit_xpaths_file.txt # Compare result to golden copy: # test_files/expected/limit_xpaths_file_success_1.txt ( str(expected_dir / 'limit_xpaths_file_success_1.txt'), - 'limit_xpaths_file.txt', + None, 'limit_xpaths_file.txt', [ str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), '--limit-xpaths-file', - str(samples_dir / 'element_1.txt'), - '--output-headers-file' + str(samples_dir / 'element_1.txt') + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --output-headers-file limit_xpaths_file.txt + # Compare result to golden copy: + # test_files/expected/limit_xpaths_file_success_1.txt + ( + str(expected_dir / 'limit_xpaths_file_success_1.txt'), + None, 'limit_xpaths_file_wack.txt', + [ + str(test_files_dir), + str(labels_dir.name / Path('tester_label_1.xml')), + str(labels_dir.name / Path('nonexistent.xml')), + '--limit-xpaths-file', + str(samples_dir / 'element_1.txt') ] ), @@ -38,13 +85,12 @@ # test_files/expected/limit_xpaths_file_success_2.txt ( str(expected_dir / 'limit_xpaths_file_success_2.txt'), - 'limit_xpaths_file_2.txt', + None, 'limit_xpaths_file_2.txt', [ str(test_files_dir), str(labels_dir.name / Path('tester_label_2.xml')), '--limit-xpaths-file', - str(samples_dir / 'element_2.txt'), - '--output-headers-file', + str(samples_dir / 'element_2.txt') ] ), @@ -53,13 +99,12 @@ # test_files/expected/limit_xpaths_file_success_2.txt ( str(expected_dir / 'limit_xpaths_file_success_2.txt'), - 'elements_dupe_file_2.txt', + None, 'elements_dupe_file_2.txt', [ str(test_files_dir), str(labels_dir.name / Path('tester_label_2.xml')), '--limit-xpaths-file', - str(samples_dir / 'element_duplicates.txt'), - '--output-headers-file', + str(samples_dir / 'element_duplicates.txt') ] ), @@ -68,14 +113,13 @@ # test_files/expected/limit_xpaths_file_success_3.txt ( str(expected_dir / 'limit_xpaths_file_success_3.txt'), - 'limit_xpaths_file_3.txt', + None, 
'limit_xpaths_file_3.txt', [ str(test_files_dir), str(labels_dir.name / Path('tester_label_2.xml')), str(labels_dir.name / Path('tester_label_3.xml')), '--limit-xpaths-file', - str(samples_dir / 'element_3.txt'), - '--output-headers-file', + str(samples_dir / 'element_3.txt') ] ), @@ -84,15 +128,14 @@ # test_files/expected/limit_xpaths_file_success_4.txt ( str(expected_dir / 'limit_xpaths_file_success_4.txt'), - 'limit_xpaths_file_4.txt', + None, 'limit_xpaths_file_4.txt', [ str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), str(labels_dir.name / Path('tester_label_2.xml')), str(labels_dir.name / Path('tester_label_3.xml')), '--limit-xpaths-file', - str(samples_dir / 'element_4.txt'), - '--output-headers-file', + str(samples_dir / 'element_4.txt') ] ), @@ -102,12 +145,11 @@ # test_files/expected/simplify_xpaths_success_1.txt ( str(expected_dir / 'simplify_xpaths_success_1.txt'), - 'simplify_xpaths_1.txt', + None, 'simplify_xpaths_1.txt', [ str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), - '--simplify-xpaths', - '--output-headers-file', + '--simplify-xpaths' ] ), @@ -117,7 +159,7 @@ # test_files/expected/simplify_xpaths_success_2.txt ( str(expected_dir / 'simplify_xpaths_success_2.txt'), - 'simplify_xpaths_2.txt', + None, 'simplify_xpaths_2.txt', [ str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), @@ -125,8 +167,7 @@ str(labels_dir.name / Path('tester_label_3.xml')), '--simplify-xpaths', '--limit-xpaths-file', - str(samples_dir / 'elements_xpath_simplify_2.txt'), - '--output-headers-file', + str(samples_dir / 'elements_xpath_simplify_2.txt') ] ), @@ -136,14 +177,13 @@ # test_files/expected/simplify_xpaths_success_3.txt ( str(expected_dir / 'simplify_xpaths_success_3.txt'), - 'simplify_xpaths_3.txt', + None, 'simplify_xpaths_3.txt', [ str(test_files_dir), str(labels_dir.name / Path('tester_label_2.xml')), '--simplify-xpaths', '--limit-xpaths-file', - str(samples_dir / 'elements_xpath_simplify_3.txt'), 
- '--output-headers-file', + str(samples_dir / 'elements_xpath_simplify_3.txt') ] ), @@ -153,14 +193,13 @@ # test_files/expected/simplify_xpaths_success_4.txt ( str(expected_dir / 'simplify_xpaths_success_4.txt'), - 'simplify_xpaths_4.txt', + None, 'simplify_xpaths_4.txt', [ str(test_files_dir), str(labels_dir.name / Path('tester_label_3.xml')), '--simplify-xpaths', '--limit-xpaths-file', - str(samples_dir / 'elements_xpath_simplify_4.txt'), - '--output-headers-file', + str(samples_dir / 'elements_xpath_simplify_4.txt') ] ), @@ -170,15 +209,14 @@ # test_files/expected/extra_file_info_success_1.csv ( str(expected_dir / 'extra_file_info_success_1.csv'), - 'extra_file_info_1.csv', + 'extra_file_info_1.csv', None, [ str(test_files_dir), str(labels_dir.name / Path('tester_label_2.xml')), '--limit-xpaths-file', - str(samples_dir / 'element_1.txt'), + str(samples_dir / 'element_extra_file_info.txt'), '--add-extra-file-info', 'filename,filepath', - '--output-index-file', ] ), @@ -188,7 +226,7 @@ # test_files/expected/extra_file_info_success_2.csv ( str(expected_dir / 'extra_file_info_success_2.csv'), - 'extra_file_info_2.csv', + 'extra_file_info_2.csv', None, [ str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), @@ -199,8 +237,7 @@ '--add-extra-file-info', 'filename', '--sort-by', - 'filename', - '--output-index-file', + 'filename' ] ), @@ -209,7 +246,7 @@ # test_files/expected/extra_file_info_success_3.csv ( str(expected_dir / 'extra_file_info_success_3.csv'), - 'extra_file_info_3.csv', + 'extra_file_info_3.csv', None, [ str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), @@ -220,8 +257,7 @@ '--add-extra-file-info', 'filename,filepath,lid,bundle,bundle_lid', '--sort-by', - 'filename', - '--output-index-file', + 'filename' ] ), @@ -231,12 +267,24 @@ # test_files/expected/clean_header_field_names_success_1.txt ( str(expected_dir / 'clean_header_field_names_success_1.txt'), - 'clean_header_field_names_1.txt', + None, 
'clean_header_field_names_1.txt', [ str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), - '--clean-header-field-names', - '--output-headers-file', + '--clean-header-field-names' + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/elements_clean_header_field_names.txt --clean-header-field-names --output-headers-file clean_header_field_names_2.txt + # Compare result to golden copy: + # test_files/expected/clean_header_field_names_success_2.txt + ( + str(expected_dir / 'clean_header_field_names_success_2.csv'), + 'clean_header_field_names_2.csv', None, + [ + str(test_files_dir), + str(labels_dir.name / Path('tester_label_1.xml')), + '--clean-header-field-names' ] ), @@ -245,15 +293,14 @@ # test_files/expected/clean_header_field_names_success_2.txt ( str(expected_dir / 'clean_header_field_names_success_2.txt'), - 'clean_header_field_names_2.txt', + None, 'clean_header_field_names_2.txt', [ str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), str(labels_dir.name / Path('tester_label_2.xml')), '--limit-xpaths-file', str(samples_dir / 'elements_clean_header_field_names.txt'), - '--clean-header-field-names', - '--output-headers-file', + '--clean-header-field-names' ] ), @@ -263,7 +310,7 @@ # test_files/expected/sort_by_success_1.csv ( str(expected_dir / 'sort_by_success_1.csv'), - 'sort_by_1.csv', + 'sort_by_1.csv', None, [ str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), @@ -273,8 +320,7 @@ str(samples_dir / 'elements_clean_header_field_names.txt'), '--sort-by', 'pds:Product_Observational/pds:Identification_Area<1>/' - 'pds:logical_identifier<1>', - '--output-index-file', + 'pds:logical_identifier<1>' ] ), @@ -283,7 +329,7 @@ # test_files/expected/sort_by_success_2.csv ( str(expected_dir / 'sort_by_success_2.csv'), - 'sort_by_2.csv', + 'sort_by_2.csv', None, [ str(test_files_dir), str(labels_dir.name / 
Path('tester_label_1.xml')), @@ -294,8 +340,7 @@ '--add-extra-file-info', 'bundle_lid,filepath', '--sort-by', - 'bundle_lid', - '--output-index-file', + 'bundle_lid' ] ), @@ -304,7 +349,7 @@ # test_files/expected/identical_labels_success.csv ( str(expected_dir / 'identical_labels_success.csv'), - 'identical_labels.csv', + 'identical_labels.csv', None, [ str(test_files_dir), str(labels_dir.name / Path('identical_label_*.xml')), @@ -313,8 +358,7 @@ '--add-extra-file-info', 'filename', '--sort-by', - 'filename', - '--output-index-file' + 'filename' ] ), @@ -323,13 +367,12 @@ # test_files/expected/nilled_element_success.csv ( str(expected_dir / 'nilled_element_success.csv'), - 'nilled_element.csv', + 'nilled_element.csv', None, [ str(test_files_dir), str(labels_dir.name / Path('nilled_label.xml')), '--limit-xpaths-file', - str(samples_dir / 'elements_nilled.txt'), - '--output-index-file' + str(samples_dir / 'elements_nilled.txt') ] ), @@ -338,38 +381,154 @@ # test_files/expected/fixed_width_success.csv ( str(expected_dir / 'fixed_width_success.csv'), - 'fixed_width.csv', + 'fixed_width.csv', None, + [ + str(test_files_dir), + str(labels_dir.name / Path('tester_label_1.xml')), + '--fixed-width' + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary --config ../test_files/expected/tester_config.yaml --output-index-file generated_label_1.csv + # Compare result to golden copy: + # test_files/expected/label_success_1.csv + # test_files/expected/label_success_1.xml + ( + str(expected_dir / 'label_success_1.csv'), + 'generated_label_1.csv', None, + [ + str(test_files_dir), + str(labels_dir.name / Path('tester_label_1.xml')), + '--generate-label', + 'ancillary', + '--config', + str(expected_dir / 'tester_config.yaml') + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label metadata --fixed-width --output-index-file generated_label_2.csv --config 
../test_files/expected/tester_config.yaml --output-index-file generated_label_2.csv + # Compare result to golden copy: + # test_files/expected/label_success_2.csv + # test_files/expected/label_success_2.xml + ( + str(expected_dir / 'label_success_2.csv'), + 'generated_label_2.csv', None, [ str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), + '--generate-label', + 'metadata', '--fixed-width', - '--output-index-file' + '--config', + str(expected_dir / 'tester_config.yaml') + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_5.txt --add-extra-file-info filename,filepath,lid,bundle,bundle_lid --generate-label ancillary --config ../test_files/expected/tester_config.yaml --output-index-file generated_label_3.csv + # Compare result to golden copy: + # test_files/expected/label_success_3.csv + # test_files/expected/label_success_3.xml + ( + str(expected_dir / 'label_success_3.csv'), + 'generated_label_3.csv', None, + [ + str(test_files_dir), + str(labels_dir.name / Path('tester_label_1.xml')), + str(labels_dir.name / Path('tester_label_2.xml')), + str(labels_dir.name / Path('tester_label_3.xml')), + '--limit-xpaths-file', + str(samples_dir / 'element_5.txt'), + '--add-extra-file-info', + 'filename,filepath,lid,bundle,bundle_lid', + '--sort-by', + 'filename', + '--generate-label', + 'ancillary', + '--config', + str(expected_dir / 'tester_config.yaml') ] ) ] ) -def test_success(golden_file, new_file, cmd_line): +def test_success(golden_file, new_file_index, new_file_headers, cmd_line): # Create a temporary directory with tempfile.TemporaryDirectory(dir=test_files_dir.parent) as temp_dir: temp_dir_path = Path(temp_dir) - # THE PATH TO THE NEW FILE - path_to_file = temp_dir_path / new_file - # Call main() function with the simulated command line arguments - cmd_line.append(str(path_to_file)) - tools.main(cmd_line) + if 
new_file_index == None and new_file_headers == None: + os.chdir(temp_dir_path) + cmd_line.append(str(test_files_dir)) + cmd_line.append(str(labels_dir.name / Path('tester_label_1.xml'))) + # Call main() function with the simulated command line arguments + tools.main(cmd_line) + + path_to_file = temp_dir_path / 'index.csv' + # Assert that the file now exists + assert os.path.isfile(path_to_file) + + # Open and compare the two files + with open(path_to_file, 'rb') as created: + formed = created.read() + + with open(golden_file, 'rb') as new: + expected = new.read() + + assert formed == expected + os.remove(path_to_file) + os.chdir(ROOT_DIR) + + else: + # THE PATH TO THE NEW FILE + if new_file_index: + path_to_file = temp_dir_path / new_file_index + cmd_line.append('--output-index-file') + cmd_line.append(str(path_to_file)) + # Call main() function with the simulated command line arguments + tools.main(cmd_line) + # Assert that the file now exists + assert os.path.isfile(path_to_file) + + # Open and compare the two files + with open(path_to_file, 'rb') as created: + formed = created.read() + + with open(golden_file, 'rb') as new: + expected = new.read() + + assert formed == expected - # Assert that the file now exists - assert os.path.isfile(path_to_file) + if '--generate-label' in cmd_line: + label_path = str(path_to_file).replace('.csv', '.xml') + golden_label = str(golden_file).replace('.csv', '.xml') + assert os.path.isfile(label_path) - # Open and compare the two files - with open(path_to_file, 'rb') as created: - formed = created.read() + # Open and compare the two files + with open(label_path, 'rb') as created: + formed = created.read() - with open(golden_file, 'rb') as new: - expected = new.read() + with open(golden_label, 'rb') as new: + expected = new.read() - assert formed == expected + assert formed == expected + + if new_file_headers: + path_to_file = temp_dir_path / new_file_headers + golden_file = str(golden_file).replace('.csv', '.txt') + 
cmd_line.append('--output-headers-file') + cmd_line.append(str(path_to_file)) + # Call main() function with the simulated command line arguments + tools.main(cmd_line) + # Assert that the file now exists + assert os.path.isfile(path_to_file) + + # Open and compare the two files + with open(path_to_file, 'rb') as created: + formed = created.read() + + with open(golden_file, 'rb') as new: + expected = new.read() + + assert formed == expected @pytest.mark.parametrize( @@ -386,6 +545,7 @@ def test_success(golden_file, new_file, cmd_line): '--add-extra-file-info', 'bad_element', '--output-headers-file', + 'hdout.txt' ), # Executable command: pds4_create_xml_index ../test_files/labels "bad_directory/labels/tester_label_*.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --add-extra-file-info filename --output-headers-file hdout.txt @@ -397,6 +557,7 @@ def test_success(golden_file, new_file, cmd_line): '--add-extra-file-info', # extra file info 'filename', '--output-headers-file', + 'hdout.txt' ), # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_empty.txt --output-headers-file hdout.txt @@ -408,7 +569,56 @@ def test_success(golden_file, new_file, cmd_line): '--limit-xpaths-file', str(samples_dir / 'element_empty.txt'), # empty elements file '--output-headers-file', + 'hdout.txt' + ), + + #Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --simplify-xpaths --sort-by bad_sort --output-headers-file hdout.csv + ( + str(test_files_dir), + str(labels_dir.name / Path('tester_label_1.xml')), + '--simplify-xpaths', + '--sort-by', + 'bad_sort', + '--output-index-file', + 'hdout.csv' + ), + + #Executable command: pds4_create_xml_index ../test_files/labels "nonexistent.xml" --output-headers-file hdout.txt + ( + str(test_files_dir), + str(labels_dir.name / Path('nonexistent.xml')), + '--output-headers-file', + 
'hdout.txt', + ), + + #Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/elements_xpath_simplify_3.txt --output-headers-file hdout.txt + ( + str(test_files_dir), + str(labels_dir.name / Path('tester_label_1.xml')), + '--limit-xpaths-file', + str(samples_dir / 'elements_xpath_simplify_3.txt'), + '--output-headers-file', + 'hdout.txt', + ), + + #Executable command: pds4_create_xml_index ../test_files/labels "tester_label_*.xml" --generate-label ancillary --output-headers-file hdout.txt + ( + str(test_files_dir), + str(labels_dir.name / Path('tester_label_*.xml')), + '--generate-label', + 'ancillary', + '--output-headers-file', + 'hdout.txt', + ), + + #Executable command: pds4_create_xml_index ../test_files/labels "bad_lid_label.xml" --output-headers-file hdout.txt + ( + str(test_files_dir), + str(labels_dir.name / Path('bad_lid_label.xml')), + '--output-headers-file', + 'hdout.txt', ) + ] ) def test_failures(cmd_line): @@ -417,7 +627,8 @@ def test_failures(cmd_line): tools.main(cmd_line) assert e.type == SystemExit assert e.value.code != 0 # Check that the exit code indicates failure - + if os.path.isfile('hdout.txt'): + os.remove('hdout.txt') @pytest.mark.parametrize( 'new_file,cmd_line', @@ -454,3 +665,7 @@ def test_failure_message(capfd, new_file, cmd_line): expected_message = ("Non-nillable element in") assert expected_message in captured.out or expected_message in captured.err + +def test_invalid_arguments(): + with pytest.raises(SystemExit): # Assuming argparse will call sys.exit on failure + tools.main(["--invalid-option"]) diff --git a/tests/test_pds4_create_xml_index_whitebox.py b/tests/test_pds4_create_xml_index_whitebox.py index c746b73..cf9f9d2 100644 --- a/tests/test_pds4_create_xml_index_whitebox.py +++ b/tests/test_pds4_create_xml_index_whitebox.py @@ -1,3 +1,4 @@ +import argparse from datetime import datetime from lxml import etree import os @@ -64,7 +65,7 @@ def 
test_load_config_object(): # Tests that the config_object is loaded over. config_object = tools.load_config_file( - specified_config_files=[str(expected_dir/'tester_config.yaml'),]) + specified_config_files=[str(expected_dir/'tester_config_nillable.yaml'),]) assert config_object['nillable']['pds:ASCII_Date_YMD']['inapplicable'] == '0001-01-01' assert config_object['nillable']['pds:ASCII_Date_YMD']['missing'] == '0002-01-01' @@ -90,6 +91,13 @@ def test_load_config_object(): assert (config_object['nillable']['pds:ASCII_Short_String_Collapsed'] ['anticipated'] == 'anticipated_alt') + # Tests specified configuration files with one or the other + config_object = tools.load_config_file( + specified_config_files=[str(expected_dir/'tester_config_label.yaml'),]) + + assert config_object['label-contents']['version_id'] == '1.0' + assert config_object['label-contents']['title'] == 'Index file for my occultation bundle' + # A bad default config file with pytest.raises(SystemExit): tools.load_config_file(default_config_file=expected_dir/'non_existent_file.ini') @@ -145,6 +153,9 @@ def test_default_value_for_nil(): '0004-01-01T12:00Z') assert tools.default_value_for_nil(config_object, datetime_ymd_utc, 'anticipated') == '0004-01-01T12:00Z' + + # Testing None + assert tools.default_value_for_nil(config_object, None, 'anticipated') == None def test_default_value_for_nil_ascii_date_time_ymd_utc(): @@ -241,6 +252,16 @@ def test_get_longest_row_length(): result = tools.get_longest_row_length(filename) assert result == 254 + # Failure + with pytest.raises(OSError): + filename = ( + '0eD8s3JGt9RmE5YnVpLZxkf2A1gNbWqQ7TXHlchyojFzPBrMOIKvaSuUwd4pC6JrXjmtbZVnLQW9' + 'gDKfpq7cHWnPoyT5sBM3YXIzlq06F4GDvw1MRaOJpEZU9kBX2AysnVrH6TQeY3G8oKPw5xfmLzN2' + 'hF7sJ9Qc8LbH4ErWaMKtVUXoPIjzpRy1D0qW4s3N7Km8HGaLFCvxl6eyP7UZjWopX4rBdQ2VME3G' + '9XtF8h2TsjvQnKwDYLb50O8xFI6gUJwpQmA7nrZ4EYkTXoR9CpMN8QG6fKjW5uVDl3oJ1wzBsPpT' + '2cFmLRe7Hg1SYkN8qQv9RcHjA0F3I4mU') + result = tools.get_longest_row_length(filename) 
+ @pytest.fixture def create_temp_file(): @@ -260,3 +281,289 @@ def test_get_creation_date(create_temp_file, platform_name): assert isinstance(creation_date, str) # Assert that the returned date is in ISO 8601 format assert datetime.fromisoformat(creation_date) + + +def test_correct_duplicates(): + label_results = { + '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name<1>': 1, + '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_1<1>': 2, + '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_2<1>': 3, + '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_3<1>': 4, + '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_4<1>': 5, + '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_5': 6 + } + + tools.correct_duplicates(label_results) + + assert label_results == { + '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name<1>': 1, + '../geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>': 2, + '../geom:SPICE_Kernel_Identification<3>/geom:spice_kernel_file_name<1>': 3, + '../geom:SPICE_Kernel_Identification<4>/geom:spice_kernel_file_name<1>': 4, + '../geom:SPICE_Kernel_Identification<5>/geom:spice_kernel_file_name<1>': 5, + '../geom:SPICE_Kernel_Identification<6>/geom:spice_kernel_file_name<1>': 6 + } + +def test_update_nillable_elements_from_xsd_file(): + xsd_files = [] + nillable_elements_info = {} + label_files = ['test_files/labels/tester_label_1.xml', + 'test_files/labels/tester_label_2.xml'] + + for label_file in label_files: + xml_urls = tools.process_schema_location(label_file) + for url in xml_urls: + if url not in xsd_files: + xsd_files.append(url) + tools.update_nillable_elements_from_xsd_file(url, nillable_elements_info) + + assert nillable_elements_info == { + 'start_time': 'pds:ASCII_Date_Time', + 'start_date_time': 'pds:ASCII_Date_Time_YMD_UTC', + 'stop_time': 'pds:ASCII_Date_Time', + 'stop_date_time': 'pds:ASCII_Date_Time_YMD_UTC', 
+ 'publication_date': 'pds:ASCII_Date_YMD', + 'stop_date': 'pds:ASCII_Date_YMD', + 'reference_frame_id': 'pds:ASCII_Short_String_Collapsed', + 'gain_mode_id': 'cassini:gain_mode_id_WO_Units', + 'gain_mode_id_ir': 'pds:ASCII_Short_String_Collapsed', + 'gain_mode_id_vis': 'pds:ASCII_Short_String_Collapsed', + 'wavelength_range': 'pds:ASCII_Short_String_Collapsed', + 'dsn_station_number': 'pds:ASCII_Integer'} + + +def test_update_nillable_elements_from_xsd_file_with_edge_cases(): + # Scenario 1: Testing with a type attribute that is None or already in + # nillable_elements_info + + # Mock XSD content with an element that doesn't have a 'type' attribute + xsd_content_missing_type = """ + + + + + """ + # Mock XSD content where type_attribute is already in nillable_elements_info + xsd_content_duplicate_type = """ + + + + + """ + + # Parse the mock XSD contents into XML trees + tree_missing_type = etree.fromstring(xsd_content_missing_type) + tree_duplicate_type = etree.fromstring(xsd_content_duplicate_type) + + # Mock the download_xsd_file function to return these trees based on input + with mock.patch('pds4indextools.pds4_create_xml_index.download_xsd_file') as mock_download: + # Define the behavior of the mock for each file + mock_download.side_effect = ( + lambda url: tree_missing_type if 'missing_type' in url + else tree_duplicate_type + ) + + # Initialize the dictionary that will hold the nillable elements information + nillable_elements_info = { + 'start_time': 'pds:ASCII_Date_Time' # Simulate an existing entry + } + + # Call the function with the first scenario (missing type) + tools.update_nillable_elements_from_xsd_file( + 'test_files/labels/missing_type.xsd', nillable_elements_info) + assert 'element_without_type' not in nillable_elements_info + + +def test_clean_header_field_names(): + data = { + 'column:1': [1, 2, 3], + 'column/2': [4, 5, 6], + '3': [7, 8, 9], + 'normal_column': [10, 11, 12] + } + df = pd.DataFrame(data) + + tools.clean_headers(df) + new = 
df.to_dict() + + assert new == { + 'column_1': {0: 1, 1: 2, 2: 3}, + 'column__2': {0: 4, 1: 5, 2: 6}, + '_column3': {0: 7, 1: 8, 2: 9}, + 'normal_column': {0: 10, 1: 11, 2: 12} + } + +def test_compute_max_field_lengths(): + + lengths = tools.compute_max_field_lengths( + str(expected_dir / 'extra_file_info_success_1.csv')) + + assert lengths == { + 'filename': 18, + 'filepath': 25, + 'pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1>': + 72, + 'pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1>': 3, + 'pds:Product_Observational/pds:Identification_Area<1>/pds:title<1>': 132 + } + + # failure + with pytest.raises(SystemExit): + lengths = tools.compute_max_field_lengths( + str(expected_dir / 'fake_file.csv')) + + +def test_sort_dataframe_key_error(): + df = pd.DataFrame({ + 'name': ['Alice', 'Bob', 'Charlie'], + 'age': [30, 25, 35] + }) + sort_keys = ['height'] # Non-existent column + + with pytest.raises(ValueError, match=f"Unknown sort key '{sort_keys[0]}'. 
For a list of available sort " + f"keys, use the --output-headers-file option."): + tools.sort_dataframe(df, sort_keys) + +def test_validate_label_type(): + arg = 'ancillary' + valid_choices = {'ancillary': 'Product_Ancillary', + 'metadata': 'Product_Metadata_Supplemental'} + assert tools.validate_label_type(arg, valid_choices) == 'Product_Ancillary' + + # failure + with pytest.raises(argparse.ArgumentTypeError): + arg = 'bad_label_type' + assert tools.validate_label_type(arg, valid_choices) == 'Product_Ancillary' + + +@mock.patch('os.path.exists') +def test_generate_unique_filename(mock_exists): + # Setup the mock to return True for the first two checks and False thereafter + mock_exists.side_effect = [True, True, False] + + # Run the function with a base filename + base_name = "file.txt" + result = tools.generate_unique_filename(base_name) + + # Assert that the result is what we expect given the mocked behavior + assert result == "file2.txt" # Since the first two checks return True, the counter reaches 2 + + # Ensure os.path.exists was called the expected number of times + assert mock_exists.call_count == 3 + + +import textwrap as _textwrap + +def test_fill_text(): + # Create an instance of MultilineFormatter + formatter = tools.MultilineFormatter(prog="test_prog") + + # Example input text with multiline separator + input_text = "This is a long text that should be wrapped.|nThis is a new paragraph." 
+ + # Expected formatted output (with appropriate indentation and line wrapping) + width = 40 + indent = " " # 4 spaces + + expected_output = ( + _textwrap.fill("This is a long text that should be wrapped.", width, initial_indent=indent, subsequent_indent=indent) + '\n' + + _textwrap.fill("This is a new paragraph.", width, initial_indent=indent, subsequent_indent=indent) + '\n' + ) + + # Run the _fill_text method + result = formatter._fill_text(input_text, width, indent) + + # Assert the result matches the expected output + assert result == expected_output + +from unittest.mock import patch + +# Assume the get_true_type function is imported from the relevant module. +# from pds4indextools.pds4_create_xml_index import get_true_type + +@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') +@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') +def test_true_type_found_in_first_file(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): + # Setup mocks + mock_download_xsd_file.return_value = "mock_xsd_tree" + mock_scrape_namespaces.return_value = {"mock_namespace": "mock_value"} + mock_find_base_attribute.side_effect = ["mock_true_type", None] # Found in the first file + + xsd_files = ["file1.xsd", "file2.xsd"] + tag = "mock_tag" + namespaces = {"existing_namespace": "value"} + + result = tools.get_true_type(xsd_files, tag, namespaces) + + assert result == "mock_true_type" + mock_download_xsd_file.assert_called_once_with("file1.xsd") + mock_find_base_attribute.assert_called_once_with("mock_xsd_tree", tag, {"mock_namespace": "mock_value"}) + + +@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') +@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') +def test_true_type_found_in_second_file(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): + # 
Setup mocks + mock_download_xsd_file.return_value = "mock_xsd_tree" + mock_scrape_namespaces.return_value = {"mock_namespace": "mock_value"} + + # First file returns None for both original and modified tags + # Second file returns the true_type for the original tag + mock_find_base_attribute.side_effect = [None, None, "mock_true_type"] + + xsd_files = ["file1.xsd", "file2.xsd"] + tag = "mock_tag" + namespaces = {"existing_namespace": "value"} + + result = tools.get_true_type(xsd_files, tag, namespaces) + + print(f"Download called: {mock_download_xsd_file.call_count} times") + print(f"Find base attribute called: {mock_find_base_attribute.call_count} times") + + # Check if the loop iterates over both files and correctly identifies the type in the second file + assert result == "mock_true_type" + assert mock_download_xsd_file.call_count == 2 # Should be called for both files + assert mock_find_base_attribute.call_count == 3 # Should be called twice for file1 (original + modified) and once for file2 + + + + +@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') +@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') +def test_true_type_found_with_modified_tag(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): + # Setup mocks + mock_download_xsd_file.return_value = "mock_xsd_tree" + mock_scrape_namespaces.return_value = {"mock_namespace": "mock_value"} + mock_find_base_attribute.side_effect = [None, "mock_true_type"] # Found after modifying the tag + + xsd_files = ["file1.xsd"] + tag = "mock_tag" + namespaces = {"existing_namespace": "value"} + + result = tools.get_true_type(xsd_files, tag, namespaces) + + assert result == "mock_true_type" + mock_find_base_attribute.assert_any_call("mock_xsd_tree", "mock_tag_WO_Units", {"mock_namespace": "mock_value"}) + + +@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') 
+@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') +def test_true_type_not_found(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): + # Setup mocks + mock_download_xsd_file.return_value = "mock_xsd_tree" + mock_scrape_namespaces.return_value = {"mock_namespace": "mock_value"} + mock_find_base_attribute.return_value = None # Never found + + xsd_files = ["file1.xsd", "file2.xsd"] + tag = "mock_tag" + namespaces = {"existing_namespace": "value"} + + result = tools.get_true_type(xsd_files, tag, namespaces) + + assert result == None + assert mock_download_xsd_file.call_count == 2 + assert mock_find_base_attribute.call_count == 4 # Both original and modified tags are checked for both files From e1cf1f9c01d9e901a3b38f0be1982c091c07a4fb Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Mon, 19 Aug 2024 14:17:01 -0700 Subject: [PATCH 03/24] Adding missing docstrings --- pds4indextools/pds4_create_xml_index.py | 68 ++++++++++++++++++++----- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index 96d4d12..4db41fd 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -54,7 +54,7 @@ def convert_header_to_xpath(root, xml_header_path, namespaces): Parameters: root (Element): The root element of the XML document. xml_header_path (str): Original XML header path. - namespaces (dict): Dictionary of XML namespace mappings. + namespaces (dict): A dictionary containing XML namespace mappings. Returns: str: Converted XPath expression. @@ -300,20 +300,32 @@ def filter_dict_by_glob_patterns(input_dict, glob_patterns, valid_add_extra_file def get_true_type(xsd_files, tag, namespaces): + """ + Determines the true type of a specified tag by searching through a list of XSD files. 
+ + This function iterates through the provided list of XSD files and attempts to find the + "true type" of the given XML tag by examining its attributes and base types. If the + type is not found with the original tag, a modified version of the tag is also + checked. + + Parameters: + xsd_files (list): A list of file paths or URLs to the XSD files. + tag (str): The XML tag to search for within the XSD files. + namespaces (dict): A dictionary containing XML namespace mappings. + + Returns: + str or None: The "true type" of the tag if found, otherwise `None`. + """ def search_type(xsd_file, tag, namespaces): - print(f"Processing file: {xsd_file}") xsd_tree = download_xsd_file(xsd_file) namespaces = scrape_namespaces(xsd_tree) true_type = find_base_attribute(xsd_tree, tag, namespaces) if true_type: - print(f"Found true_type for tag '{tag}' in file: {xsd_file}") return true_type # Check for modified tag if the first search does not find a match modified_tag = tag + "_WO_Units" true_type = find_base_attribute(xsd_tree, modified_tag, namespaces) - if true_type: - print(f"Found true_type for modified tag '{modified_tag}' in file: {xsd_file}") return true_type # This will return either the found type or None for xsd_file in xsd_files: @@ -322,8 +334,7 @@ def search_type(xsd_file, tag, namespaces): print(f"Returning true_type found in file: {xsd_file}") return true_type - print("No true_type found in any file.") - return None # Return None if no match is found in any file + return None def load_config_file( @@ -926,11 +937,44 @@ def scrape_namespaces(tree): def sort_dataframe(df, sort_keys): - try: - df.sort_values(by=sort_keys, inplace=True) - except KeyError as bad_sort: - raise ValueError(f'Unknown sort key {bad_sort}. For a list of available sort ' - f'keys, use the --output-headers-file option.') + """ + Sorts a DataFrame based on specified keys. + + This function sorts the input DataFrame in place using the provided sort keys. 
+ If an invalid key is provided, a `ValueError` is raised with a message indicating + the unknown key and suggesting how to obtain a list of valid keys. + + Parameters: + df (pandas.DataFrame): The DataFrame to be sorted. + sort_keys (str or list of str): The column name(s) to sort the DataFrame by. + Can be a single string or a list of strings. + + Raises: + ValueError: If any of the provided sort keys are not found in the DataFrame, + a `ValueError` is raised with a descriptive error message. + + Example: + >>> df = pd.DataFrame({ + ... 'name': ['Alice', 'Bob', 'Charlie'], + ... 'age': [25, 30, 22] + ... }) + >>> sort_keys = ['age'] + >>> sort_dataframe(df, sort_keys) + >>> print(df) + name age + 2 Charlie 22 + 0 Alice 25 + 1 Bob 30 + + Notes: + - The sorting is done in place, so the original DataFrame is modified. + - The function will raise an error if any of the specified sort keys are invalid. + """ + try: + df.sort_values(by=sort_keys, inplace=True) + except KeyError as bad_sort: + raise ValueError(f'Unknown sort key {bad_sort}. For a list of available sort ' + f'keys, use the --output-headers-file option.') def get_creation_date(file_path): From 6edcafcb333fec6c6f14e7216430d85ed1daaba5 Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Mon, 19 Aug 2024 14:38:51 -0700 Subject: [PATCH 04/24] Making everything flake8 compliant --- pds4indextools/pds4_create_xml_index.py | 51 ++++----- tests/test_pds4_create_xml_index_blackbox.py | 29 +++--- tests/test_pds4_create_xml_index_whitebox.py | 104 +++++++++++-------- 3 files changed, 96 insertions(+), 88 deletions(-) diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index 4db41fd..956eed4 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -303,7 +303,7 @@ def get_true_type(xsd_files, tag, namespaces): """ Determines the true type of a specified tag by searching through a list of XSD files. 
- This function iterates through the provided list of XSD files and attempts to find the + This function iterates through the provided list of XSD files and attempts to find the "true type" of the given XML tag by examining its attributes and base types. If the type is not found with the original tag, a modified version of the tag is also checked. @@ -725,7 +725,7 @@ def update_nillable_elements_from_xsd_file(xsd_file, nillable_elements_info): # Attempt to find the type definition in the document type_definition_xpath = (f'//xs:simpleType[@name="{type_name}"] | ' - f'//xs:complexType[@name="{type_name}"]') + f'//xs:complexType[@name="{type_name}"]') type_definition = tree.xpath( type_definition_xpath, namespaces=namespace) @@ -737,12 +737,12 @@ def update_nillable_elements_from_xsd_file(xsd_file, nillable_elements_info): try: restriction = type_definition.find('.//xs:restriction', - namespaces=namespace) + namespaces=namespace) base_type = restriction.get('base') except AttributeError: extension = type_definition.find('.//xs:extension', - namespaces=namespace) + namespaces=namespace) base_type = extension.get('base') nillable_elements_info[name] = ( @@ -793,7 +793,6 @@ def pad_column_values_and_headers(df): return padded_df - rows = [] for result_dict in results_list: rows.append(result_dict) @@ -808,7 +807,6 @@ def pad_column_values_and_headers(df): print(bad_sort) sys.exit(1) - if args.clean_header_field_names: clean_headers(df) @@ -848,7 +846,6 @@ def find_base_attribute(xsd_tree, target_name, new_namespaces): } namespaces.update(new_namespaces) - def get_base_type(query): """ Executes an XPath query to find the base type. @@ -940,17 +937,17 @@ def sort_dataframe(df, sort_keys): """ Sorts a DataFrame based on specified keys. - This function sorts the input DataFrame in place using the provided sort keys. 
- If an invalid key is provided, a `ValueError` is raised with a message indicating + This function sorts the input DataFrame in place using the provided sort keys. + If an invalid key is provided, a `ValueError` is raised with a message indicating the unknown key and suggesting how to obtain a list of valid keys. Parameters: df (pandas.DataFrame): The DataFrame to be sorted. - sort_keys (str or list of str): The column name(s) to sort the DataFrame by. + sort_keys (str or list of str): The column name(s) to sort the DataFrame by. Can be a single string or a list of strings. Raises: - ValueError: If any of the provided sort keys are not found in the DataFrame, + ValueError: If any of the provided sort keys are not found in the DataFrame, a `ValueError` is raised with a descriptive error message. Example: @@ -974,10 +971,10 @@ def sort_dataframe(df, sort_keys): df.sort_values(by=sort_keys, inplace=True) except KeyError as bad_sort: raise ValueError(f'Unknown sort key {bad_sort}. For a list of available sort ' - f'keys, use the --output-headers-file option.') + f'keys, use the --output-headers-file option.') -def get_creation_date(file_path): +def get_creation_date(file_path): """ Returns the creation date of a file in ISO 8601 format. 
@@ -995,7 +992,7 @@ def get_creation_date(file_path): stat = os.stat(file_path) try: creation_time = stat.st_birthtime - except AttributeError: # pragma: no coverage + except AttributeError: # pragma: no coverage # Fallback to the last modification time if birth time is not available creation_time = stat.st_mtime @@ -1168,7 +1165,7 @@ def _fill_text(self, text, width, indent): def main(cmd_line=None): epilog_sfx = '' - if __version__ != 'Version unspecified': # pragma: no coverage + if __version__ != 'Version unspecified': # pragma: no coverage epilog_sfx = f'|nVersion: {__version__}' parser = argparse.ArgumentParser( formatter_class=MultilineFormatter, @@ -1312,14 +1309,14 @@ def main(cmd_line=None): for pattern in patterns: files = directory_path.glob(pattern) - + # Create an iterator from the generator files_iter = iter(files) - + # Use a sentinel object to check if there's any item sentinel = object() first_file = next(files_iter, sentinel) - + if first_file is sentinel: print(f"No files found for pattern: {pattern}") else: @@ -1368,7 +1365,7 @@ def main(cmd_line=None): filepath = str(label_file.relative_to(args.directorypath)).replace('\\', '/') # PDS4 compliant filepaths must be less than 255 characters. - if len(filepath) > 255: # pragma: no coverage + if len(filepath) > 255: # pragma: no coverage print(f'Filepath {filepath} exceeds 255 character limit.') sys.exit(1) @@ -1382,7 +1379,7 @@ def main(cmd_line=None): label_results = {} traverse_and_store(root, tree, label_results, xsd_files, nillable_elements_info, config, label_file) - + # # The XPath headers in the label_results dictionary are reformatted to # # improve readability. Each XPath's namespace is replaced with its prefix for # # faster reference. 
Duplicate XPaths are made unique to ensure all results are @@ -1444,7 +1441,6 @@ def main(cmd_line=None): all_results.append(label_results) - if args.add_extra_file_info and elements_to_scrape is not None: elements_to_scrape = args.add_extra_file_info + elements_to_scrape @@ -1452,7 +1448,7 @@ def main(cmd_line=None): # of the --limit-xpaths-file input file. If this command is not used, the original # dictionary will be returned. Glob patterns are processed sequentially, with the # first pattern having the highest priority. - + for label_results in all_results: ind = all_results.index(label_results) label_results_new = filter_dict_by_glob_patterns( @@ -1471,7 +1467,7 @@ def main(cmd_line=None): headers = {} unique_tags_master = [] - # Step 1: Gather all possible tags from labels + # Step 1: Gather all possible tags from labels for label_results in all_results: keys = label_results.keys() for key in keys: @@ -1495,11 +1491,9 @@ def main(cmd_line=None): for tag in tags: name = tag.split('<')[0] if (tags.count(tag) == 1 and names.count(name) == 1 - and tag not in unique_tags): + and tag not in unique_tags): unique_tags.append(tag) - # if tags.count(tag) > 1 and tag in unique_tags_master: - # unique_tags_master.remove(tag) - + for tag in unique_tags: unique_tags_master.append(tag) @@ -1515,7 +1509,6 @@ def main(cmd_line=None): all_results[ind] = new_label_results - if output_csv_path: write_results_to_csv(all_results, args, output_csv_path) @@ -1663,5 +1656,5 @@ def main(cmd_line=None): template.write(label_content, str(output_subdir / filename) + '.xml') -if __name__ == '__main__': # pragma: no coverage +if __name__ == '__main__': # pragma: no coverage main() diff --git a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index f71e7ed..a672e9b 100644 --- a/tests/test_pds4_create_xml_index_blackbox.py +++ b/tests/test_pds4_create_xml_index_blackbox.py @@ -2,7 +2,6 @@ import pytest import os import tempfile -import shutil import 
pds4indextools.pds4_create_xml_index as tools @@ -18,14 +17,14 @@ @pytest.mark.parametrize( 'golden_file,new_file_index,new_file_headers,cmd_line', [ - #Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" ( str(expected_dir / 'index_file_success.csv'), None, None, [] ), - #Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary ( str(expected_dir / 'index_file_success.csv'), None, None, @@ -455,7 +454,7 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): with tempfile.TemporaryDirectory(dir=test_files_dir.parent) as temp_dir: temp_dir_path = Path(temp_dir) - if new_file_index == None and new_file_headers == None: + if new_file_index is None and new_file_headers is None: os.chdir(temp_dir_path) cmd_line.append(str(test_files_dir)) cmd_line.append(str(labels_dir.name / Path('tester_label_1.xml'))) @@ -572,9 +571,9 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): 'hdout.txt' ), - #Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --simplify-xpaths --sort-by bad_sort --output-headers-file hdout.csv + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --simplify-xpaths --sort-by bad_sort --output-headers-file hdout.csv ( - str(test_files_dir), + str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), '--simplify-xpaths', '--sort-by', @@ -583,17 +582,17 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): 'hdout.csv' ), - #Executable command: pds4_create_xml_index ../test_files/labels "nonexistent.xml" --output-headers-file hdout.txt + # Executable command: pds4_create_xml_index ../test_files/labels "nonexistent.xml" 
--output-headers-file hdout.txt ( - str(test_files_dir), + str(test_files_dir), str(labels_dir.name / Path('nonexistent.xml')), '--output-headers-file', 'hdout.txt', ), - #Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/elements_xpath_simplify_3.txt --output-headers-file hdout.txt + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/elements_xpath_simplify_3.txt --output-headers-file hdout.txt ( - str(test_files_dir), + str(test_files_dir), str(labels_dir.name / Path('tester_label_1.xml')), '--limit-xpaths-file', str(samples_dir / 'elements_xpath_simplify_3.txt'), @@ -601,9 +600,9 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): 'hdout.txt', ), - #Executable command: pds4_create_xml_index ../test_files/labels "tester_label_*.xml" --generate-label ancillary --output-headers-file hdout.txt + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_*.xml" --generate-label ancillary --output-headers-file hdout.txt ( - str(test_files_dir), + str(test_files_dir), str(labels_dir.name / Path('tester_label_*.xml')), '--generate-label', 'ancillary', @@ -611,9 +610,9 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): 'hdout.txt', ), - #Executable command: pds4_create_xml_index ../test_files/labels "bad_lid_label.xml" --output-headers-file hdout.txt + # Executable command: pds4_create_xml_index ../test_files/labels "bad_lid_label.xml" --output-headers-file hdout.txt ( - str(test_files_dir), + str(test_files_dir), str(labels_dir.name / Path('bad_lid_label.xml')), '--output-headers-file', 'hdout.txt', @@ -630,6 +629,7 @@ def test_failures(cmd_line): if os.path.isfile('hdout.txt'): os.remove('hdout.txt') + @pytest.mark.parametrize( 'new_file,cmd_line', [ @@ -666,6 +666,7 @@ def test_failure_message(capfd, new_file, cmd_line): expected_message = 
("Non-nillable element in") assert expected_message in captured.out or expected_message in captured.err + def test_invalid_arguments(): with pytest.raises(SystemExit): # Assuming argparse will call sys.exit on failure tools.main(["--invalid-option"]) diff --git a/tests/test_pds4_create_xml_index_whitebox.py b/tests/test_pds4_create_xml_index_whitebox.py index cf9f9d2..ada7110 100644 --- a/tests/test_pds4_create_xml_index_whitebox.py +++ b/tests/test_pds4_create_xml_index_whitebox.py @@ -6,7 +6,9 @@ from pathlib import Path import pytest import pds4indextools.pds4_create_xml_index as tools +import textwrap as _textwrap from unittest import mock +from unittest.mock import patch # These two variables are the same for all tests, so we can either declare them as @@ -96,7 +98,8 @@ def test_load_config_object(): specified_config_files=[str(expected_dir/'tester_config_label.yaml'),]) assert config_object['label-contents']['version_id'] == '1.0' - assert config_object['label-contents']['title'] == 'Index file for my occultation bundle' + assert (config_object['label-contents']['title'] == + 'Index file for my occultation bundle') # A bad default config file with pytest.raises(SystemExit): @@ -153,9 +156,9 @@ def test_default_value_for_nil(): '0004-01-01T12:00Z') assert tools.default_value_for_nil(config_object, datetime_ymd_utc, 'anticipated') == '0004-01-01T12:00Z' - + # Testing None - assert tools.default_value_for_nil(config_object, None, 'anticipated') == None + assert tools.default_value_for_nil(config_object, None, 'anticipated') is None def test_default_value_for_nil_ascii_date_time_ymd_utc(): @@ -292,7 +295,7 @@ def test_correct_duplicates(): '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_4<1>': 5, '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_5': 6 } - + tools.correct_duplicates(label_results) assert label_results == { @@ -304,12 +307,13 @@ def test_correct_duplicates(): 
'../geom:SPICE_Kernel_Identification<6>/geom:spice_kernel_file_name<1>': 6 } + def test_update_nillable_elements_from_xsd_file(): xsd_files = [] nillable_elements_info = {} label_files = ['test_files/labels/tester_label_1.xml', 'test_files/labels/tester_label_2.xml'] - + for label_file in label_files: xml_urls = tools.process_schema_location(label_file) for url in xml_urls: @@ -335,7 +339,7 @@ def test_update_nillable_elements_from_xsd_file(): def test_update_nillable_elements_from_xsd_file_with_edge_cases(): # Scenario 1: Testing with a type attribute that is None or already in # nillable_elements_info - + # Mock XSD content with an element that doesn't have a 'type' attribute xsd_content_missing_type = """ @@ -356,7 +360,9 @@ def test_update_nillable_elements_from_xsd_file_with_edge_cases(): tree_duplicate_type = etree.fromstring(xsd_content_duplicate_type) # Mock the download_xsd_file function to return these trees based on input - with mock.patch('pds4indextools.pds4_create_xml_index.download_xsd_file') as mock_download: + with mock.patch( + 'pds4indextools.pds4_create_xml_index.download_xsd_file' + ) as mock_download: # Define the behavior of the mock for each file mock_download.side_effect = ( lambda url: tree_missing_type if 'missing_type' in url @@ -382,7 +388,7 @@ def test_clean_header_field_names(): 'normal_column': [10, 11, 12] } df = pd.DataFrame(data) - + tools.clean_headers(df) new = df.to_dict() @@ -392,7 +398,8 @@ def test_clean_header_field_names(): '_column3': {0: 7, 1: 8, 2: 9}, 'normal_column': {0: 10, 1: 11, 2: 12} } - + + def test_compute_max_field_lengths(): lengths = tools.compute_max_field_lengths( @@ -406,12 +413,11 @@ def test_compute_max_field_lengths(): 'pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1>': 3, 'pds:Product_Observational/pds:Identification_Area<1>/pds:title<1>': 132 } - + # failure with pytest.raises(SystemExit): - lengths = tools.compute_max_field_lengths( - str(expected_dir / 'fake_file.csv')) - + 
lengths = tools.compute_max_field_lengths(str(expected_dir / 'fake_file.csv')) + def test_sort_dataframe_key_error(): df = pd.DataFrame({ @@ -420,10 +426,12 @@ def test_sort_dataframe_key_error(): }) sort_keys = ['height'] # Non-existent column - with pytest.raises(ValueError, match=f"Unknown sort key '{sort_keys[0]}'. For a list of available sort " - f"keys, use the --output-headers-file option."): + with pytest.raises(ValueError, match=f"Unknown sort key '{sort_keys[0]}'. For a list " + f"of available sort keys, use the " + f"--output-headers-file option."): tools.sort_dataframe(df, sort_keys) + def test_validate_label_type(): arg = 'ancillary' valid_choices = {'ancillary': 'Product_Ancillary', @@ -446,14 +454,13 @@ def test_generate_unique_filename(mock_exists): result = tools.generate_unique_filename(base_name) # Assert that the result is what we expect given the mocked behavior - assert result == "file2.txt" # Since the first two checks return True, the counter reaches 2 + # Since the first two checks return True, the counter reaches 2 + assert result == "file2.txt" # Ensure os.path.exists was called the expected number of times assert mock_exists.call_count == 3 -import textwrap as _textwrap - def test_fill_text(): # Create an instance of MultilineFormatter formatter = tools.MultilineFormatter(prog="test_prog") @@ -466,8 +473,10 @@ def test_fill_text(): indent = " " # 4 spaces expected_output = ( - _textwrap.fill("This is a long text that should be wrapped.", width, initial_indent=indent, subsequent_indent=indent) + '\n' + - _textwrap.fill("This is a new paragraph.", width, initial_indent=indent, subsequent_indent=indent) + '\n' + _textwrap.fill("This is a long text that should be wrapped.", + width, initial_indent=indent, subsequent_indent=indent) + '\n' + + _textwrap.fill("This is a new paragraph.", width, initial_indent=indent, + subsequent_indent=indent) + '\n' ) # Run the _fill_text method @@ -476,39 +485,40 @@ def test_fill_text(): # Assert the result 
matches the expected output assert result == expected_output -from unittest.mock import patch # Assume the get_true_type function is imported from the relevant module. # from pds4indextools.pds4_create_xml_index import get_true_type - @patch('pds4indextools.pds4_create_xml_index.download_xsd_file') @patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') @patch('pds4indextools.pds4_create_xml_index.find_base_attribute') -def test_true_type_found_in_first_file(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): +def test_true_type_found_in_first_file(mock_find_base_attribute, mock_scrape_namespaces, + mock_download_xsd_file): # Setup mocks mock_download_xsd_file.return_value = "mock_xsd_tree" mock_scrape_namespaces.return_value = {"mock_namespace": "mock_value"} - mock_find_base_attribute.side_effect = ["mock_true_type", None] # Found in the first file + mock_find_base_attribute.side_effect = ["mock_true_type", None] xsd_files = ["file1.xsd", "file2.xsd"] tag = "mock_tag" namespaces = {"existing_namespace": "value"} - + result = tools.get_true_type(xsd_files, tag, namespaces) - + assert result == "mock_true_type" mock_download_xsd_file.assert_called_once_with("file1.xsd") - mock_find_base_attribute.assert_called_once_with("mock_xsd_tree", tag, {"mock_namespace": "mock_value"}) + mock_find_base_attribute.assert_called_once_with("mock_xsd_tree", tag, + {"mock_namespace": "mock_value"}) @patch('pds4indextools.pds4_create_xml_index.download_xsd_file') @patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') @patch('pds4indextools.pds4_create_xml_index.find_base_attribute') -def test_true_type_found_in_second_file(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): +def test_true_type_found_in_second_file(mock_find_base_attribute, mock_scrape_namespaces, + mock_download_xsd_file): # Setup mocks mock_download_xsd_file.return_value = "mock_xsd_tree" mock_scrape_namespaces.return_value = {"mock_namespace": 
"mock_value"} - + # First file returns None for both original and modified tags # Second file returns the true_type for the original tag mock_find_base_attribute.side_effect = [None, None, "mock_true_type"] @@ -516,43 +526,47 @@ def test_true_type_found_in_second_file(mock_find_base_attribute, mock_scrape_na xsd_files = ["file1.xsd", "file2.xsd"] tag = "mock_tag" namespaces = {"existing_namespace": "value"} - + result = tools.get_true_type(xsd_files, tag, namespaces) print(f"Download called: {mock_download_xsd_file.call_count} times") print(f"Find base attribute called: {mock_find_base_attribute.call_count} times") - # Check if the loop iterates over both files and correctly identifies the type in the second file + # Check if the loop iterates over both files and correctly identifies the type in + # the second file assert result == "mock_true_type" - assert mock_download_xsd_file.call_count == 2 # Should be called for both files - assert mock_find_base_attribute.call_count == 3 # Should be called twice for file1 (original + modified) and once for file2 - - + assert mock_download_xsd_file.call_count == 2 + assert mock_find_base_attribute.call_count == 3 @patch('pds4indextools.pds4_create_xml_index.download_xsd_file') @patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') @patch('pds4indextools.pds4_create_xml_index.find_base_attribute') -def test_true_type_found_with_modified_tag(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): +def test_true_type_found_with_modified_tag(mock_find_base_attribute, + mock_scrape_namespaces, + mock_download_xsd_file): # Setup mocks mock_download_xsd_file.return_value = "mock_xsd_tree" mock_scrape_namespaces.return_value = {"mock_namespace": "mock_value"} - mock_find_base_attribute.side_effect = [None, "mock_true_type"] # Found after modifying the tag + # Found after modifying the tag + mock_find_base_attribute.side_effect = [None, "mock_true_type"] xsd_files = ["file1.xsd"] tag = "mock_tag" 
namespaces = {"existing_namespace": "value"} - + result = tools.get_true_type(xsd_files, tag, namespaces) - + assert result == "mock_true_type" - mock_find_base_attribute.assert_any_call("mock_xsd_tree", "mock_tag_WO_Units", {"mock_namespace": "mock_value"}) + mock_find_base_attribute.assert_any_call("mock_xsd_tree", "mock_tag_WO_Units", + {"mock_namespace": "mock_value"}) @patch('pds4indextools.pds4_create_xml_index.download_xsd_file') @patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') @patch('pds4indextools.pds4_create_xml_index.find_base_attribute') -def test_true_type_not_found(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): +def test_true_type_not_found(mock_find_base_attribute, mock_scrape_namespaces, + mock_download_xsd_file): # Setup mocks mock_download_xsd_file.return_value = "mock_xsd_tree" mock_scrape_namespaces.return_value = {"mock_namespace": "mock_value"} @@ -561,9 +575,9 @@ def test_true_type_not_found(mock_find_base_attribute, mock_scrape_namespaces, m xsd_files = ["file1.xsd", "file2.xsd"] tag = "mock_tag" namespaces = {"existing_namespace": "value"} - + result = tools.get_true_type(xsd_files, tag, namespaces) - - assert result == None + + assert result is None assert mock_download_xsd_file.call_count == 2 - assert mock_find_base_attribute.call_count == 4 # Both original and modified tags are checked for both files + assert mock_find_base_attribute.call_count == 4 From fe85192639e31394437810887bc67b98200372dc Mon Sep 17 00:00:00 2001 From: Robert French Date: Mon, 19 Aug 2024 16:52:23 -0700 Subject: [PATCH 05/24] Remove terminator dependence from tests --- tests/test_pds4_create_xml_index_blackbox.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index a672e9b..ff944eb 100644 --- a/tests/test_pds4_create_xml_index_blackbox.py +++ b/tests/test_pds4_create_xml_index_blackbox.py @@ 
-466,10 +466,10 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): assert os.path.isfile(path_to_file) # Open and compare the two files - with open(path_to_file, 'rb') as created: + with open(path_to_file, 'r') as created: formed = created.read() - with open(golden_file, 'rb') as new: + with open(golden_file, 'r') as new: expected = new.read() assert formed == expected @@ -488,10 +488,10 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): assert os.path.isfile(path_to_file) # Open and compare the two files - with open(path_to_file, 'rb') as created: + with open(path_to_file, 'r') as created: formed = created.read() - with open(golden_file, 'rb') as new: + with open(golden_file, 'r') as new: expected = new.read() assert formed == expected @@ -502,10 +502,10 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): assert os.path.isfile(label_path) # Open and compare the two files - with open(label_path, 'rb') as created: + with open(label_path, 'r') as created: formed = created.read() - with open(golden_label, 'rb') as new: + with open(golden_label, 'r') as new: expected = new.read() assert formed == expected @@ -521,10 +521,10 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): assert os.path.isfile(path_to_file) # Open and compare the two files - with open(path_to_file, 'rb') as created: + with open(path_to_file, 'r') as created: formed = created.read() - with open(golden_file, 'rb') as new: + with open(golden_file, 'r') as new: expected = new.read() assert formed == expected From 641c87c2c8c92cf775b86def66b9487f8e63606a Mon Sep 17 00:00:00 2001 From: Robert French Date: Mon, 19 Aug 2024 17:08:40 -0700 Subject: [PATCH 06/24] Force \n line terminator on writing CSV --- pds4indextools/pds4_create_xml_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index 
956eed4..e2002f6 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -817,7 +817,7 @@ def pad_column_values_and_headers(df): else: print(f'Index file generated at {output_csv_path}') - df.to_csv(output_csv_path, index=False, na_rep='') + df.to_csv(output_csv_path, index=False, na_rep='', lineterminator='\n') def find_base_attribute(xsd_tree, target_name, new_namespaces): From 4fa4dc6282aa50fd5d91409db087ba5818af07d9 Mon Sep 17 00:00:00 2001 From: Robert French Date: Mon, 19 Aug 2024 17:11:10 -0700 Subject: [PATCH 07/24] Missed a to_csv --- pds4indextools/pds4_create_xml_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index e2002f6..3c25688 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -813,7 +813,7 @@ def pad_column_values_and_headers(df): if args.fixed_width: padded_df = pad_column_values_and_headers(df) print(f'Fixed-width index file generated at {output_csv_path}') - padded_df.to_csv(output_csv_path, index=False, na_rep='') + padded_df.to_csv(output_csv_path, index=False, na_rep='', lineterminator='\n') else: print(f'Index file generated at {output_csv_path}') From 328b3395a7fb6e0ab9097700273abad05126291d Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Tue, 20 Aug 2024 09:22:07 -0700 Subject: [PATCH 08/24] Removing --rename-headers and --dont-number-unique-tags --- pds4indextools/pds4_create_xml_index.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index 956eed4..d28d904 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -331,7 +331,6 @@ def search_type(xsd_file, tag, namespaces): for xsd_file in xsd_files: true_type = search_type(xsd_file, tag, namespaces) if true_type: # Only return if true_type is 
not None - print(f"Returning true_type found in file: {xsd_file}") return true_type return None @@ -1223,15 +1222,6 @@ def main(cmd_line=None): 'contain characters permissible in variable ' 'names.') - index_file_generation.add_argument('--rename-headers', type=str, - metavar='NEW_HEADERS_FILEPATH', - help='Rename headers in the generated index file' - 'according to a given mapping file.') - - index_file_generation.add_argument('--dont-number-unique-tags', action='store_true', - help='Removes the predicates of unique XPath ' - 'headers.') - index_file_generation.add_argument( '--simplify-xpaths', action='store_true', From 82294e80031bdf8c9f2b627f515fbfa862b722a3 Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Mon, 26 Aug 2024 13:21:37 -0700 Subject: [PATCH 09/24] Minor syntax changes, fixed issue with label generation --- pds4indextools/index_label_template_pds.xml | 22 + pds4indextools/pds4_create_xml_index.py | 1 - test_files/expected/label_success_1.xml | 2 +- test_files/expected/label_success_2.xml | 2 +- test_files/expected/label_success_3.xml | 2 +- test_files/expected/tester_config.yaml | 2 +- tests/test_pds4_create_xml_index_blackbox.py | 1048 +++++++++--------- tests/test_pds4_create_xml_index_whitebox.py | 57 +- 8 files changed, 569 insertions(+), 567 deletions(-) diff --git a/pds4indextools/index_label_template_pds.xml b/pds4indextools/index_label_template_pds.xml index 89b6bb6..982db44 100644 --- a/pds4indextools/index_label_template_pds.xml +++ b/pds4indextools/index_label_template_pds.xml @@ -103,6 +103,28 @@ $END_IF $END_IF $IF(Product_Metadata_Supplemental) + $END_IF + $IF(Product_Ancillary and isinstance(Product_Ancillary, dict) and 'File_Area_Ancillary' in Product_Ancillary) + $IF(EXISTS(File)) + + $BASENAME(TEMPFILE)$ + index-table + $File['creation_date_time']$ + $File['md5_checksum']$ + + + $END_IF + $END_IF + $IF(Product_Metadata_Supplemental and isinstance(Product_Metadata_Supplemental, dict) and 'File_Area_Ancillary' in 
Product_Metadata_Supplemental) + $IF(File) + + $BASENAME(TEMPFILE)$ + index-table + $File['creation_date_time']$ + $File['md5_checksum']$ + + + $END_IF $END_IF $BASENAME(TEMPFILE)$ diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index 3dfabf5..34ca2ff 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -1532,7 +1532,6 @@ def main(cmd_line=None): if args.generate_label: index_file = output_csv_path - print(index_file) # The template label file is initialized. module_dir = Path(__file__).resolve().parent diff --git a/test_files/expected/label_success_1.xml b/test_files/expected/label_success_1.xml index 6d4945f..aade34b 100644 --- a/test_files/expected/label_success_1.xml +++ b/test_files/expected/label_success_1.xml @@ -26,7 +26,7 @@ generated_label_1.csv index-table - 00:00:00 + 0001-01-01T00:00:00.00Z a177a1160bf3780c01e3bd9e02be89f4 diff --git a/test_files/expected/label_success_2.xml b/test_files/expected/label_success_2.xml index b5bed6e..628ae41 100644 --- a/test_files/expected/label_success_2.xml +++ b/test_files/expected/label_success_2.xml @@ -26,7 +26,7 @@ generated_label_2.csv index-table - 00:00:00 + 0001-01-01T00:00:00.00Z 53d47b320936ac3fbba0852696065418 diff --git a/test_files/expected/label_success_3.xml b/test_files/expected/label_success_3.xml index 96cc903..2e6127e 100644 --- a/test_files/expected/label_success_3.xml +++ b/test_files/expected/label_success_3.xml @@ -26,7 +26,7 @@ generated_label_3.csv index-table - 00:00:00 + 0001-01-01T00:00:00.00Z 8b2eb69a284938d23748de7f53d2e45b diff --git a/test_files/expected/tester_config.yaml b/test_files/expected/tester_config.yaml index 35d3d8c..7b29fd8 100644 --- a/test_files/expected/tester_config.yaml +++ b/test_files/expected/tester_config.yaml @@ -20,4 +20,4 @@ nillable: label-contents: version_id: 1.1 File: - creation_date_time: '00:00:00' + creation_date_time: '0001-01-01T00:00:00.00Z' diff --git 
a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index ff944eb..949f9eb 100644 --- a/tests/test_pds4_create_xml_index_blackbox.py +++ b/tests/test_pds4_create_xml_index_blackbox.py @@ -2,545 +2,527 @@ import pytest import os import tempfile +import shutil import pds4indextools.pds4_create_xml_index as tools # These two variables are the same for all tests, so we can either declare them as # global variables, or get the ROOT_DIR at the setup stage before running each test ROOT_DIR = Path(__file__).resolve().parent.parent -test_files_dir = ROOT_DIR / 'test_files' -samples_dir = test_files_dir / 'samples' -expected_dir = test_files_dir / 'expected' -labels_dir = test_files_dir / 'labels' +TEST_FILES_DIR = ROOT_DIR / 'test_files' +SAMPLES_DIR = TEST_FILES_DIR / 'samples' +EXPECTED_DIR = TEST_FILES_DIR / 'expected' +LABELS_DIR = TEST_FILES_DIR / 'labels' +LABEL_NAME = LABELS_DIR.name + + +def compare_files(path_to_file, golden_file): + # Assert that the file now exists + assert os.path.isfile(path_to_file) + + # Open and compare the two files + with open(path_to_file, 'r') as created: + formed = created.read() + + with open(golden_file, 'r') as new: + expected = new.read() + + assert formed == expected @pytest.mark.parametrize( - 'golden_file,new_file_index,new_file_headers,cmd_line', - [ - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" - ( - str(expected_dir / 'index_file_success.csv'), - None, None, - [] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary - ( - str(expected_dir / 'index_file_success.csv'), - None, None, - [ - '--generate-label', - 'ancillary' - ] - ), - - # Testing --limit-xpaths-file with two outputs - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --output-headers-file limit_xpaths_file.txt 
--output-index-file limit_xpaths_file.csv - # Compare result to golden copy: - # test_files/expected/limit_xpaths_file_success_1.txt - ( - str(expected_dir / 'limit_xpaths_file_success_1.csv'), - 'limit_xpaths_file.csv', 'limit_xpaths_file.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - '--limit-xpaths-file', - str(samples_dir / 'element_1.txt') - ] - ), - - # Testing --limit-xpaths-file - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --output-headers-file limit_xpaths_file.txt - # Compare result to golden copy: - # test_files/expected/limit_xpaths_file_success_1.txt - ( - str(expected_dir / 'limit_xpaths_file_success_1.txt'), - None, 'limit_xpaths_file.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - '--limit-xpaths-file', - str(samples_dir / 'element_1.txt') - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --output-headers-file limit_xpaths_file.txt - # Compare result to golden copy: - # test_files/expected/limit_xpaths_file_success_1.txt - ( - str(expected_dir / 'limit_xpaths_file_success_1.txt'), - None, 'limit_xpaths_file_wack.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - str(labels_dir.name / Path('nonexistent.xml')), - '--limit-xpaths-file', - str(samples_dir / 'element_1.txt') - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_2.xml" --limit-xpaths-file ../test_files/samples/element_2.txt --output-headers-file limit_xpaths_file_2.txt - # Compare result to golden copy: - # test_files/expected/limit_xpaths_file_success_2.txt - ( - str(expected_dir / 'limit_xpaths_file_success_2.txt'), - None, 'limit_xpaths_file_2.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_2.xml')), - 
'--limit-xpaths-file', - str(samples_dir / 'element_2.txt') - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_2.xml" --limit-xpaths-file ../test_files/samples/element_duplicates.txt --output-headers-file elements_dupe_file_2.txt - # Compare result to golden copy: - # test_files/expected/limit_xpaths_file_success_2.txt - ( - str(expected_dir / 'limit_xpaths_file_success_2.txt'), - None, 'elements_dupe_file_2.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_2.xml')), - '--limit-xpaths-file', - str(samples_dir / 'element_duplicates.txt') - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_2.xml" tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_3.txt --output-headers-file limit_xpaths_file_3.txt - # Compare result to golden copy: - # test_files/expected/limit_xpaths_file_success_3.txt - ( - str(expected_dir / 'limit_xpaths_file_success_3.txt'), - None, 'limit_xpaths_file_3.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_2.xml')), - str(labels_dir.name / Path('tester_label_3.xml')), - '--limit-xpaths-file', - str(samples_dir / 'element_3.txt') - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_4.txt --output-headers-file limit_xpaths_file_4.txt - # Compare result to golden copy: - # test_files/expected/limit_xpaths_file_success_4.txt - ( - str(expected_dir / 'limit_xpaths_file_success_4.txt'), - None, 'limit_xpaths_file_4.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - str(labels_dir.name / Path('tester_label_2.xml')), - str(labels_dir.name / Path('tester_label_3.xml')), - '--limit-xpaths-file', - str(samples_dir / 'element_4.txt') - ] - ), - - # Testing --simplify-xpaths - # Executable command: pds4_create_xml_index ../test_files/labels 
"tester_label_1.xml" --simplify-xpaths --output-headers-file simplify_xpaths_1.txt - # Compare result to golden copy: - # test_files/expected/simplify_xpaths_success_1.txt - ( - str(expected_dir / 'simplify_xpaths_success_1.txt'), - None, 'simplify_xpaths_1.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - '--simplify-xpaths' - ] - ), - - # Testing --simplify-xpaths - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --simplify-xpaths --limit-xpaths-file ../test_files/samples/elements_xpath_simplify_2.txt --output-headers-file simplify_xpaths_2.txt - # Compare result to golden copy: - # test_files/expected/simplify_xpaths_success_2.txt - ( - str(expected_dir / 'simplify_xpaths_success_2.txt'), - None, 'simplify_xpaths_2.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - str(labels_dir.name / Path('tester_label_2.xml')), - str(labels_dir.name / Path('tester_label_3.xml')), - '--simplify-xpaths', - '--limit-xpaths-file', - str(samples_dir / 'elements_xpath_simplify_2.txt') - ] - ), - - # Testing --simplify-xpaths - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_2.xml" --simplify-xpaths --limit-xpaths-file ../test_files/samples/elements_xpath_simplify_3.txt --output-headers-file simplify_xpaths_3.txt - # Compare result to golden copy: - # test_files/expected/simplify_xpaths_success_3.txt - ( - str(expected_dir / 'simplify_xpaths_success_3.txt'), - None, 'simplify_xpaths_3.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_2.xml')), - '--simplify-xpaths', - '--limit-xpaths-file', - str(samples_dir / 'elements_xpath_simplify_3.txt') - ] - ), - - # Testing --simplify-xpaths - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_3.xml" --simplify-xpaths --limit-xpaths-file ../test_files/samples/elements_xpath_simplify_4.txt 
--output-headers-file simplify_xpaths_4.txt - # Compare result to golden copy: - # test_files/expected/simplify_xpaths_success_4.txt - ( - str(expected_dir / 'simplify_xpaths_success_4.txt'), - None, 'simplify_xpaths_4.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_3.xml')), - '--simplify-xpaths', - '--limit-xpaths-file', - str(samples_dir / 'elements_xpath_simplify_4.txt') - ] - ), - - # Testing --add-extra-file-info - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_2.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --add-extra-file-info filename,filepath --output-index-file extra_file_info_1.csv - # Compare result to golden copy: - # test_files/expected/extra_file_info_success_1.csv - ( - str(expected_dir / 'extra_file_info_success_1.csv'), - 'extra_file_info_1.csv', None, - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_2.xml')), - '--limit-xpaths-file', - str(samples_dir / 'element_extra_file_info.txt'), - '--add-extra-file-info', - 'filename,filepath', - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_5.txt --add-extra-file-info filename --sort-by filename - # --output-index-file extra_file_info_2.csv - # Compare result to golden copy: - # test_files/expected/extra_file_info_success_2.csv - ( - str(expected_dir / 'extra_file_info_success_2.csv'), - 'extra_file_info_2.csv', None, - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - str(labels_dir.name / Path('tester_label_2.xml')), - str(labels_dir.name / Path('tester_label_3.xml')), - '--limit-xpaths-file', - str(samples_dir / 'element_5.txt'), - '--add-extra-file-info', - 'filename', - '--sort-by', - 'filename' - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" 
--limit-xpaths-file ../test_files/samples/element_5.txt --add-extra-file-info filename,filepath,lid,bundle,bundle_lid --sort-by filename --output-index-file extra_file_info_3.csv - # Compare result to golden copy: - # test_files/expected/extra_file_info_success_3.csv - ( - str(expected_dir / 'extra_file_info_success_3.csv'), - 'extra_file_info_3.csv', None, - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - str(labels_dir.name / Path('tester_label_2.xml')), - str(labels_dir.name / Path('tester_label_3.xml')), - '--limit-xpaths-file', - str(samples_dir / 'element_5.txt'), - '--add-extra-file-info', - 'filename,filepath,lid,bundle,bundle_lid', - '--sort-by', - 'filename' - ] - ), - - # Testing --clean-header-field-names - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --clean-header-field-names --output-headers-file clean_header_field_names_1.txt - # Compare result to golden copy: - # test_files/expected/clean_header_field_names_success_1.txt - ( - str(expected_dir / 'clean_header_field_names_success_1.txt'), - None, 'clean_header_field_names_1.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - '--clean-header-field-names' - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/elements_clean_header_field_names.txt --clean-header-field-names --output-headers-file clean_header_field_names_2.txt - # Compare result to golden copy: - # test_files/expected/clean_header_field_names_success_2.txt - ( - str(expected_dir / 'clean_header_field_names_success_2.csv'), - 'clean_header_field_names_2.csv', None, - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - '--clean-header-field-names' - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_1.xml" --limit-xpaths-file 
../test_files/samples/elements_clean_header_field_names.txt --clean-header-field-names --output-headers-file clean_header_field_names_2.txt - # Compare result to golden copy: - # test_files/expected/clean_header_field_names_success_2.txt - ( - str(expected_dir / 'clean_header_field_names_success_2.txt'), - None, 'clean_header_field_names_2.txt', - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - str(labels_dir.name / Path('tester_label_2.xml')), - '--limit-xpaths-file', - str(samples_dir / 'elements_clean_header_field_names.txt'), - '--clean-header-field-names' - ] - ), - - # Testing --sort by - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/elements_clean_header_field_names.txt --sort-by 'pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1>' --output-index-file sort_by_1.csv - # Compare result to golden copy: - # test_files/expected/sort_by_success_1.csv - ( - str(expected_dir / 'sort_by_success_1.csv'), - 'sort_by_1.csv', None, - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - str(labels_dir.name / Path('tester_label_2.xml')), - str(labels_dir.name / Path('tester_label_3.xml')), - '--limit-xpaths-file', - str(samples_dir / 'elements_clean_header_field_names.txt'), - '--sort-by', - 'pds:Product_Observational/pds:Identification_Area<1>/' - 'pds:logical_identifier<1>' - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/elements_clean_header_field_names.txt --add-extra-file-info bundle_lid,filepath --sort-by bundle_lid --output-index-file sort_by_2.csv - # Compare result to golden copy: - # test_files/expected/sort_by_success_2.csv - ( - str(expected_dir / 'sort_by_success_2.csv'), - 'sort_by_2.csv', None, - [ - 
str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - str(labels_dir.name / Path('tester_label_2.xml')), - str(labels_dir.name / Path('tester_label_3.xml')), - '--limit-xpaths-file', - str(samples_dir / 'elements_clean_header_field_names.txt'), - '--add-extra-file-info', - 'bundle_lid,filepath', - '--sort-by', - 'bundle_lid' - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "identical_label_*.xml" --limit-xpaths-file ../test_files/samples/identical_elements.txt --add-extra-file-info filename --sort-by filename --output-index-file identical_labels.csv - # Compare result to golden copy: - # test_files/expected/identical_labels_success.csv - ( - str(expected_dir / 'identical_labels_success.csv'), - 'identical_labels.csv', None, - [ - str(test_files_dir), - str(labels_dir.name / Path('identical_label_*.xml')), - '--limit-xpaths-file', - str(samples_dir / 'identical_elements.txt'), - '--add-extra-file-info', - 'filename', - '--sort-by', - 'filename' - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "nilled_label.xml" --limit-xpaths-file ../test_files/samples/elements_nilled.txt --output-index-file nilled_elements.csv - # Compare result to golden copy: - # test_files/expected/nilled_element_success.csv - ( - str(expected_dir / 'nilled_element_success.csv'), - 'nilled_element.csv', None, - [ - str(test_files_dir), - str(labels_dir.name / Path('nilled_label.xml')), - '--limit-xpaths-file', - str(samples_dir / 'elements_nilled.txt') - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --fixed-width --output-index-file fixed_width.csv - # Compare result to golden copy: - # test_files/expected/fixed_width_success.csv - ( - str(expected_dir / 'fixed_width_success.csv'), - 'fixed_width.csv', None, - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - '--fixed-width' - ] - ), - - # Executable command: pds4_create_xml_index 
../test_files/labels "tester_label_1.xml" --generate-label ancillary --config ../test_files/expected/tester_config.yaml --output-index-file generated_label_1.csv - # Compare result to golden copy: - # test_files/expected/label_success_1.csv - # test_files/expected/label_success_1.xml - ( - str(expected_dir / 'label_success_1.csv'), - 'generated_label_1.csv', None, - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - '--generate-label', - 'ancillary', - '--config', - str(expected_dir / 'tester_config.yaml') - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label metadata --fixed-width --output-index-file generated_label_2.csv --config ../test_files/expected/tester_config.yaml --output-index-file generated_label_2.csv - # Compare result to golden copy: - # test_files/expected/label_success_2.csv - # test_files/expected/label_success_2.xml - ( - str(expected_dir / 'label_success_2.csv'), - 'generated_label_2.csv', None, - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - '--generate-label', - 'metadata', - '--fixed-width', - '--config', - str(expected_dir / 'tester_config.yaml') - ] - ), - - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_5.txt --add-extra-file-info filename,filepath,lid,bundle,bundle_lid --generate-label ancillary --config ../test_files/expected/tester_config.yaml --output-index-file generated_label_3.csv - # Compare result to golden copy: - # test_files/expected/label_success_3.csv - # test_files/expected/label_success_3.xml - ( - str(expected_dir / 'label_success_3.csv'), - 'generated_label_3.csv', None, - [ - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - str(labels_dir.name / Path('tester_label_2.xml')), - str(labels_dir.name / Path('tester_label_3.xml')), - 
'--limit-xpaths-file', - str(samples_dir / 'element_5.txt'), - '--add-extra-file-info', - 'filename,filepath,lid,bundle,bundle_lid', - '--sort-by', - 'filename', - '--generate-label', - 'ancillary', - '--config', - str(expected_dir / 'tester_config.yaml') - ] - ) - ] - ) -def test_success(golden_file, new_file_index, new_file_headers, cmd_line): + 'GOLDEN_FILE,NEW_FILE_INDEX,NEW_FILE_HEADERS,CMD_LINE', + [ + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" + ( + str(EXPECTED_DIR / 'index_file_success.csv'), + None, None, + [] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary + ( + str(EXPECTED_DIR / 'index_file_success.csv'), + None, None, + [ + '--generate-label', + 'ancillary' + ] + ), + + # Testing --limit-xpaths-file with two outputs + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --output-headers-file limit_xpaths_file.txt --output-index-file limit_xpaths_file.csv + # Compare result to golden copy: + # test_files/expected/limit_xpaths_file_success_1.txt + ( + str(EXPECTED_DIR / 'limit_xpaths_file_success_1.csv'), + 'limit_xpaths_file.csv', 'limit_xpaths_file.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'element_1.txt') + ] + ), + + # Testing --limit-xpaths-file + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --output-headers-file limit_xpaths_file.txt + # Compare result to golden copy: + # test_files/expected/limit_xpaths_file_success_1.txt + ( + str(EXPECTED_DIR / 'limit_xpaths_file_success_1.txt'), + None, 'limit_xpaths_file.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'element_1.txt') + ] + ), + + # Executable command: 
pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --output-headers-file limit_xpaths_file.txt + # Compare result to golden copy: + # test_files/expected/limit_xpaths_file_success_1.txt + ( + str(EXPECTED_DIR / 'limit_xpaths_file_success_1.txt'), + None, 'limit_xpaths_file_wack.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + LABEL_NAME + '/nonexistent.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'element_1.txt') + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_2.xml" --limit-xpaths-file ../test_files/samples/element_2.txt --output-headers-file limit_xpaths_file_2.txt + # Compare result to golden copy: + # test_files/expected/limit_xpaths_file_success_2.txt + ( + str(EXPECTED_DIR / 'limit_xpaths_file_success_2.txt'), + None, 'limit_xpaths_file_2.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_2.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'element_2.txt') + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_2.xml" --limit-xpaths-file ../test_files/samples/element_duplicates.txt --output-headers-file elements_dupe_file_2.txt + # Compare result to golden copy: + # test_files/expected/limit_xpaths_file_success_2.txt + ( + str(EXPECTED_DIR / 'limit_xpaths_file_success_2.txt'), + None, 'elements_dupe_file_2.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_2.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'element_duplicates.txt') + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_2.xml" tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_3.txt --output-headers-file limit_xpaths_file_3.txt + # Compare result to golden copy: + # test_files/expected/limit_xpaths_file_success_3.txt + ( + str(EXPECTED_DIR / 'limit_xpaths_file_success_3.txt'), + None, 'limit_xpaths_file_3.txt', + [ + 
str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_2.xml', + LABEL_NAME + '/tester_label_3.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'element_3.txt') + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_4.txt --output-headers-file limit_xpaths_file_4.txt + # Compare result to golden copy: + # test_files/expected/limit_xpaths_file_success_4.txt + ( + str(EXPECTED_DIR / 'limit_xpaths_file_success_4.txt'), + None, 'limit_xpaths_file_4.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + LABEL_NAME + '/tester_label_2.xml', + LABEL_NAME + '/tester_label_3.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'element_4.txt') + ] + ), + + # Testing --simplify-xpaths + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --simplify-xpaths --output-headers-file simplify_xpaths_1.txt + # Compare result to golden copy: + # test_files/expected/simplify_xpaths_success_1.txt + ( + str(EXPECTED_DIR / 'simplify_xpaths_success_1.txt'), + None, 'simplify_xpaths_1.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + '--simplify-xpaths' + ] + ), + + # Testing --simplify-xpaths + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --simplify-xpaths --limit-xpaths-file ../test_files/samples/elements_xpath_simplify_2.txt --output-headers-file simplify_xpaths_2.txt + # Compare result to golden copy: + # test_files/expected/simplify_xpaths_success_2.txt + ( + str(EXPECTED_DIR / 'simplify_xpaths_success_2.txt'), + None, 'simplify_xpaths_2.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + LABEL_NAME + '/tester_label_2.xml', + LABEL_NAME + '/tester_label_3.xml', + '--simplify-xpaths', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'elements_xpath_simplify_2.txt') + ] + ), + + # 
Testing --simplify-xpaths + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_2.xml" --simplify-xpaths --limit-xpaths-file ../test_files/samples/elements_xpath_simplify_3.txt --output-headers-file simplify_xpaths_3.txt + # Compare result to golden copy: + # test_files/expected/simplify_xpaths_success_3.txt + ( + str(EXPECTED_DIR / 'simplify_xpaths_success_3.txt'), + None, 'simplify_xpaths_3.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_2.xml', + '--simplify-xpaths', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'elements_xpath_simplify_3.txt') + ] + ), + + # Testing --simplify-xpaths + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_3.xml" --simplify-xpaths --limit-xpaths-file ../test_files/samples/elements_xpath_simplify_4.txt --output-headers-file simplify_xpaths_4.txt + # Compare result to golden copy: + # test_files/expected/simplify_xpaths_success_4.txt + ( + str(EXPECTED_DIR / 'simplify_xpaths_success_4.txt'), + None, 'simplify_xpaths_4.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_3.xml', + '--simplify-xpaths', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'elements_xpath_simplify_4.txt') + ] + ), + + # Testing --add-extra-file-info + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_2.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --add-extra-file-info filename,filepath --output-index-file extra_file_info_1.csv + # Compare result to golden copy: + # test_files/expected/extra_file_info_success_1.csv + ( + str(EXPECTED_DIR / 'extra_file_info_success_1.csv'), + 'extra_file_info_1.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_2.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'element_extra_file_info.txt'), + '--add-extra-file-info', + 'filename,filepath', + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" 
--limit-xpaths-file ../test_files/samples/element_5.txt --add-extra-file-info filename --sort-by filename + # --output-index-file extra_file_info_2.csv + # Compare result to golden copy: + # test_files/expected/extra_file_info_success_2.csv + ( + str(EXPECTED_DIR / 'extra_file_info_success_2.csv'), + 'extra_file_info_2.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + LABEL_NAME + '/tester_label_2.xml', + LABEL_NAME + '/tester_label_3.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'element_5.txt'), + '--add-extra-file-info', + 'filename', + '--sort-by', + 'filename' + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_5.txt --add-extra-file-info filename,filepath,lid,bundle,bundle_lid --sort-by filename --output-index-file extra_file_info_3.csv + # Compare result to golden copy: + # test_files/expected/extra_file_info_success_3.csv + ( + str(EXPECTED_DIR / 'extra_file_info_success_3.csv'), + 'extra_file_info_3.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + LABEL_NAME + '/tester_label_2.xml', + LABEL_NAME + '/tester_label_3.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'element_5.txt'), + '--add-extra-file-info', + 'filename,filepath,lid,bundle,bundle_lid', + '--sort-by', + 'filename' + ] + ), + + # Testing --clean-header-field-names + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --clean-header-field-names --output-headers-file clean_header_field_names_1.txt + # Compare result to golden copy: + # test_files/expected/clean_header_field_names_success_1.txt + ( + str(EXPECTED_DIR / 'clean_header_field_names_success_1.txt'), + None, 'clean_header_field_names_1.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + '--clean-header-field-names' + ] + ), + + # Executable command: pds4_create_xml_index 
../test_files/labels "tester_label_1.xml" "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/elements_clean_header_field_names.txt --clean-header-field-names --output-headers-file clean_header_field_names_2.txt + # Compare result to golden copy: + # test_files/expected/clean_header_field_names_success_2.txt + ( + str(EXPECTED_DIR / 'clean_header_field_names_success_2.csv'), + 'clean_header_field_names_2.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + '--clean-header-field-names' + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/elements_clean_header_field_names.txt --clean-header-field-names --output-headers-file clean_header_field_names_2.txt + # Compare result to golden copy: + # test_files/expected/clean_header_field_names_success_2.txt + ( + str(EXPECTED_DIR / 'clean_header_field_names_success_2.txt'), + None, 'clean_header_field_names_2.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + LABEL_NAME + '/tester_label_2.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'elements_clean_header_field_names.txt'), + '--clean-header-field-names' + ] + ), + + # Testing --sort by + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/elements_clean_header_field_names.txt --sort-by 'pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1>' --output-index-file sort_by_1.csv + # Compare result to golden copy: + # test_files/expected/sort_by_success_1.csv + ( + str(EXPECTED_DIR / 'sort_by_success_1.csv'), + 'sort_by_1.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + LABEL_NAME + '/tester_label_2.xml', + LABEL_NAME + '/tester_label_3.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'elements_clean_header_field_names.txt'), + 
'--sort-by', + 'pds:Product_Observational/pds:Identification_Area<1>/' + 'pds:logical_identifier<1>' + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/elements_clean_header_field_names.txt --add-extra-file-info bundle_lid,filepath --sort-by bundle_lid --output-index-file sort_by_2.csv + # Compare result to golden copy: + # test_files/expected/sort_by_success_2.csv + ( + str(EXPECTED_DIR / 'sort_by_success_2.csv'), + 'sort_by_2.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + LABEL_NAME + '/tester_label_2.xml', + LABEL_NAME + '/tester_label_3.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'elements_clean_header_field_names.txt'), + '--add-extra-file-info', + 'bundle_lid,filepath', + '--sort-by', + 'bundle_lid' + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "identical_label_*.xml" --limit-xpaths-file ../test_files/samples/identical_elements.txt --add-extra-file-info filename --sort-by filename --output-index-file identical_labels.csv + # Compare result to golden copy: + # test_files/expected/identical_labels_success.csv + ( + str(EXPECTED_DIR / 'identical_labels_success.csv'), + 'identical_labels.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/identical_label_*.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'identical_elements.txt'), + '--add-extra-file-info', + 'filename', + '--sort-by', + 'filename' + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "nilled_label.xml" --limit-xpaths-file ../test_files/samples/elements_nilled.txt --output-index-file nilled_elements.csv + # Compare result to golden copy: + # test_files/expected/nilled_element_success.csv + ( + str(EXPECTED_DIR / 'nilled_element_success.csv'), + 'nilled_element.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/nilled_label.xml', + '--limit-xpaths-file', + 
str(SAMPLES_DIR / 'elements_nilled.txt') + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --fixed-width --output-index-file fixed_width.csv + # Compare result to golden copy: + # test_files/expected/fixed_width_success.csv + ( + str(EXPECTED_DIR / 'fixed_width_success.csv'), + 'fixed_width.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + '--fixed-width' + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary --config ../test_files/expected/tester_config.yaml --output-index-file generated_label_1.csv + # Compare result to golden copy: + # test_files/expected/label_success_1.csv + # test_files/expected/label_success_1.xml + ( + str(EXPECTED_DIR / 'label_success_1.csv'), + 'generated_label_1.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + '--generate-label', + 'ancillary', + '--config', + str(EXPECTED_DIR / 'tester_config.yaml') + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label metadata --fixed-width --output-index-file generated_label_2.csv --config ../test_files/expected/tester_config.yaml --output-index-file generated_label_2.csv + # Compare result to golden copy: + # test_files/expected/label_success_2.csv + # test_files/expected/label_success_2.xml + ( + str(EXPECTED_DIR / 'label_success_2.csv'), + 'generated_label_2.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + '--generate-label', + 'metadata', + '--fixed-width', + '--config', + str(EXPECTED_DIR / 'tester_config.yaml') + ] + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_5.txt --add-extra-file-info filename,filepath,lid,bundle,bundle_lid --generate-label ancillary --config 
../test_files/expected/tester_config.yaml --output-index-file generated_label_3.csv + # Compare result to golden copy: + # test_files/expected/label_success_3.csv + # test_files/expected/label_success_3.xml + ( + str(EXPECTED_DIR / 'label_success_3.csv'), + 'generated_label_3.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + LABEL_NAME + '/tester_label_2.xml', + LABEL_NAME + '/tester_label_3.xml', + '--limit-xpaths-file', + str(SAMPLES_DIR / 'element_5.txt'), + '--add-extra-file-info', + 'filename,filepath,lid,bundle,bundle_lid', + '--sort-by', + 'filename', + '--generate-label', + 'ancillary', + '--config', + str(EXPECTED_DIR / 'tester_config.yaml') + ] + ) + ] +) +def test_success(GOLDEN_FILE, NEW_FILE_INDEX, NEW_FILE_HEADERS, CMD_LINE): # Create a temporary directory - with tempfile.TemporaryDirectory(dir=test_files_dir.parent) as temp_dir: + with tempfile.TemporaryDirectory(dir=TEST_FILES_DIR.parent) as temp_dir: temp_dir_path = Path(temp_dir) - if new_file_index is None and new_file_headers is None: - os.chdir(temp_dir_path) - cmd_line.append(str(test_files_dir)) - cmd_line.append(str(labels_dir.name / Path('tester_label_1.xml'))) + if NEW_FILE_INDEX is None and NEW_FILE_HEADERS is None: + shutil.copy(LABELS_DIR / 'tester_label_1.xml', temp_dir_path) + CMD_LINE.append(str(temp_dir_path)) + CMD_LINE.append('tester_label_1.xml') + CMD_LINE.append('--output-index-file') + CMD_LINE.append(str(temp_dir_path / 'index.csv')) # Call main() function with the simulated command line arguments - tools.main(cmd_line) + tools.main(CMD_LINE) path_to_file = temp_dir_path / 'index.csv' - # Assert that the file now exists - assert os.path.isfile(path_to_file) - - # Open and compare the two files - with open(path_to_file, 'r') as created: - formed = created.read() - with open(golden_file, 'r') as new: - expected = new.read() - - assert formed == expected - os.remove(path_to_file) - os.chdir(ROOT_DIR) + compare_files(path_to_file, GOLDEN_FILE) else: # 
THE PATH TO THE NEW FILE - if new_file_index: - path_to_file = temp_dir_path / new_file_index - cmd_line.append('--output-index-file') - cmd_line.append(str(path_to_file)) + if NEW_FILE_INDEX: + path_to_file = temp_dir_path / NEW_FILE_INDEX + CMD_LINE.append('--output-index-file') + CMD_LINE.append(str(path_to_file)) # Call main() function with the simulated command line arguments - tools.main(cmd_line) - # Assert that the file now exists - assert os.path.isfile(path_to_file) - - # Open and compare the two files - with open(path_to_file, 'r') as created: - formed = created.read() + tools.main(CMD_LINE) - with open(golden_file, 'r') as new: - expected = new.read() + compare_files(path_to_file, GOLDEN_FILE) - assert formed == expected - - if '--generate-label' in cmd_line: + if '--generate-label' in CMD_LINE: label_path = str(path_to_file).replace('.csv', '.xml') - golden_label = str(golden_file).replace('.csv', '.xml') + golden_label = str(GOLDEN_FILE).replace('.csv', '.xml') assert os.path.isfile(label_path) - # Open and compare the two files - with open(label_path, 'r') as created: - formed = created.read() - - with open(golden_label, 'r') as new: - expected = new.read() - - assert formed == expected + compare_files(label_path, golden_label) - if new_file_headers: - path_to_file = temp_dir_path / new_file_headers - golden_file = str(golden_file).replace('.csv', '.txt') - cmd_line.append('--output-headers-file') - cmd_line.append(str(path_to_file)) + if NEW_FILE_HEADERS: + path_to_file = temp_dir_path / NEW_FILE_HEADERS + GOLDEN_FILE = str(GOLDEN_FILE).replace('.csv', '.txt') + CMD_LINE.append('--output-headers-file') + CMD_LINE.append(str(path_to_file)) # Call main() function with the simulated command line arguments - tools.main(cmd_line) - # Assert that the file now exists - assert os.path.isfile(path_to_file) - - # Open and compare the two files - with open(path_to_file, 'r') as created: - formed = created.read() - - with open(golden_file, 'r') as new: - 
expected = new.read() + tools.main(CMD_LINE) - assert formed == expected + compare_files(path_to_file, GOLDEN_FILE) @pytest.mark.parametrize( - 'cmd_line', + 'CMD_LINE', [ # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --add-extra-file-info bad_element --output-headers-file hdout.txt ( - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), - str(labels_dir.name / Path('tester_label_2.xml')), - str(labels_dir.name / Path('tester_label_3.xml')), + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + LABEL_NAME + '/tester_label_2.xml', + LABEL_NAME + '/tester_label_3.xml', '--limit-xpaths-file', - str(samples_dir / 'element_1.txt'), + str(SAMPLES_DIR / 'element_1.txt'), '--add-extra-file-info', 'bad_element', '--output-headers-file', @@ -549,10 +531,10 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): # Executable command: pds4_create_xml_index ../test_files/labels "bad_directory/labels/tester_label_*.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --add-extra-file-info filename --output-headers-file hdout.txt ( - str(test_files_dir), # directory path + str(TEST_FILES_DIR), # directory path 'bad_directory/labels/tester_label_*.xml', # non-existent directory '--limit-xpaths-file', - str(samples_dir / 'element_1.txt'), # elements file + str(SAMPLES_DIR / 'element_1.txt'), # elements file '--add-extra-file-info', # extra file info 'filename', '--output-headers-file', @@ -561,20 +543,20 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_empty.txt --output-headers-file hdout.txt ( - str(test_files_dir), # directory path - str(labels_dir.name / 
Path('tester_label_1.xml')), - str(labels_dir.name / Path('tester_label_2.xml')), - str(labels_dir.name / Path('tester_label_3.xml')), + str(TEST_FILES_DIR), # directory path + LABEL_NAME + '/tester_label_1.xml', + LABEL_NAME + '/tester_label_2.xml', + LABEL_NAME + '/tester_label_3.xml', '--limit-xpaths-file', - str(samples_dir / 'element_empty.txt'), # empty elements file + str(SAMPLES_DIR / 'element_empty.txt'), # empty elements file '--output-headers-file', 'hdout.txt' ), # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --simplify-xpaths --sort-by bad_sort --output-headers-file hdout.csv ( - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', '--simplify-xpaths', '--sort-by', 'bad_sort', @@ -584,26 +566,26 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): # Executable command: pds4_create_xml_index ../test_files/labels "nonexistent.xml" --output-headers-file hdout.txt ( - str(test_files_dir), - str(labels_dir.name / Path('nonexistent.xml')), + str(TEST_FILES_DIR), + LABEL_NAME + '/nonexistent.xml', '--output-headers-file', 'hdout.txt', ), # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/elements_xpath_simplify_3.txt --output-headers-file hdout.txt ( - str(test_files_dir), - str(labels_dir.name / Path('tester_label_1.xml')), + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', '--limit-xpaths-file', - str(samples_dir / 'elements_xpath_simplify_3.txt'), + str(SAMPLES_DIR / 'elements_xpath_simplify_3.txt'), '--output-headers-file', 'hdout.txt', ), # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_*.xml" --generate-label ancillary --output-headers-file hdout.txt ( - str(test_files_dir), - str(labels_dir.name / Path('tester_label_*.xml')), + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_*.xml', 
'--generate-label', 'ancillary', '--output-headers-file', @@ -612,18 +594,18 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): # Executable command: pds4_create_xml_index ../test_files/labels "bad_lid_label.xml" --output-headers-file hdout.txt ( - str(test_files_dir), - str(labels_dir.name / Path('bad_lid_label.xml')), + str(TEST_FILES_DIR), + LABEL_NAME + '/bad_lid_label.xml', '--output-headers-file', 'hdout.txt', ) ] ) -def test_failures(cmd_line): +def test_failures(CMD_LINE): # Call main() function with the simulated command line arguments with pytest.raises(SystemExit) as e: - tools.main(cmd_line) + tools.main(CMD_LINE) assert e.type == SystemExit assert e.value.code != 0 # Check that the exit code indicates failure if os.path.isfile('hdout.txt'): @@ -631,32 +613,32 @@ def test_failures(cmd_line): @pytest.mark.parametrize( - 'new_file,cmd_line', + 'NEW_FILE,CMD_LINE', [ # Executable command: pds4_create_xml_index ../test_files/labels "nilled_label_bad.xml" --limit-xpaths-file ../test_files/samples/elements_nilled_bad.txt --output-index-file indexout.csv ( 'nillable.csv', [ - str(test_files_dir), # directory path - str(labels_dir.name / Path('nilled_label_bad.xml')), + str(TEST_FILES_DIR), # directory path + LABEL_NAME + '/nilled_label_bad.xml', '--limit-xpaths-file', - str(samples_dir / 'elements_nilled_bad.txt'), + str(SAMPLES_DIR / 'elements_nilled_bad.txt'), '--output-index-file' ] ) ] ) -def test_failure_message(capfd, new_file, cmd_line): - with tempfile.TemporaryDirectory(dir=test_files_dir.parent) as temp_dir: +def test_failure_message(capfd, NEW_FILE, CMD_LINE): + with tempfile.TemporaryDirectory(dir=TEST_FILES_DIR.parent) as temp_dir: temp_dir_path = Path(temp_dir) # THE PATH TO THE NEW FILE - path_to_file = temp_dir_path / new_file + path_to_file = temp_dir_path / NEW_FILE # Call main() function with the simulated command line arguments - cmd_line.append(str(path_to_file)) + CMD_LINE.append(str(path_to_file)) # Capture the 
output - tools.main(cmd_line) + tools.main(CMD_LINE) captured = capfd.readouterr() # Check if the expected statement is printed in stdout or stderr diff --git a/tests/test_pds4_create_xml_index_whitebox.py b/tests/test_pds4_create_xml_index_whitebox.py index ada7110..1fa6027 100644 --- a/tests/test_pds4_create_xml_index_whitebox.py +++ b/tests/test_pds4_create_xml_index_whitebox.py @@ -7,16 +7,15 @@ import pytest import pds4indextools.pds4_create_xml_index as tools import textwrap as _textwrap -from unittest import mock -from unittest.mock import patch +from unittest.mock import patch as PATCH # These two variables are the same for all tests, so we can either declare them as # global variables, or get the ROOT_DIR at the setup stage before running each test ROOT_DIR = Path(__file__).resolve().parent.parent -test_files_dir = ROOT_DIR / 'test_files' -expected_dir = test_files_dir / 'expected' -labels_dir = test_files_dir / 'labels' +TEST_FILES_DIR = ROOT_DIR / 'test_files' +EXPECTED_DIR = TEST_FILES_DIR / 'expected' +LABELS_DIR = TEST_FILES_DIR / 'labels' # Testing load_config_file() @@ -67,7 +66,7 @@ def test_load_config_object(): # Tests that the config_object is loaded over. 
config_object = tools.load_config_file( - specified_config_files=[str(expected_dir/'tester_config_nillable.yaml'),]) + specified_config_files=[str(EXPECTED_DIR / 'tester_config_nillable.yaml'),]) assert config_object['nillable']['pds:ASCII_Date_YMD']['inapplicable'] == '0001-01-01' assert config_object['nillable']['pds:ASCII_Date_YMD']['missing'] == '0002-01-01' @@ -95,7 +94,7 @@ def test_load_config_object(): # Tests specified configuration files wiht one or the other config_object = tools.load_config_file( - specified_config_files=[str(expected_dir/'tester_config_label.yaml'),]) + specified_config_files=[str(EXPECTED_DIR / 'tester_config_label.yaml'),]) assert config_object['label-contents']['version_id'] == '1.0' assert (config_object['label-contents']['title'] == @@ -103,12 +102,12 @@ def test_load_config_object(): # A bad default config file with pytest.raises(SystemExit): - tools.load_config_file(default_config_file=expected_dir/'non_existent_file.ini') + tools.load_config_file(default_config_file=EXPECTED_DIR / 'non_existent_file.ini') # A bad specified config file with pytest.raises(SystemExit): tools.load_config_file(specified_config_files=list( - str(expected_dir/'non_existent_file.ini'))) + str(EXPECTED_DIR / 'non_existent_file.ini'))) # Testing default_value_for_nil() @@ -201,7 +200,7 @@ def test_split_into_elements(): # Testing process_schema_location() def test_process_schema_location(): label_file = 'tester_label_1.xml' - schema_files = tools.process_schema_location(labels_dir / label_file) + schema_files = tools.process_schema_location(LABELS_DIR / label_file) assert (schema_files[0] == 'https://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1B00.xsd') assert (schema_files[1] == @@ -221,7 +220,7 @@ def test_parse_label_file_exception_handling(capsys): def test_extract_logical_identifier(): label_file = 'tester_label_1.xml' - tree = etree.parse(str(labels_dir / label_file)) + tree = etree.parse(str(LABELS_DIR / label_file)) assert 
(tools.extract_logical_identifier(tree) == 'urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n') @@ -251,7 +250,7 @@ def test_scrape_namespaces(): def test_get_longest_row_length(): - filename = expected_dir / 'extra_file_info_success_1.csv' + filename = EXPECTED_DIR / 'extra_file_info_success_1.csv' result = tools.get_longest_row_length(filename) assert result == 254 @@ -279,7 +278,7 @@ def create_temp_file(): @pytest.mark.parametrize('platform_name', ['Windows', 'Linux', 'Darwin']) def test_get_creation_date(create_temp_file, platform_name): # Mock platform.system() to simulate different platforms - with mock.patch('platform.system', return_value=platform_name): + with PATCH('platform.system', return_value=platform_name): creation_date = tools.get_creation_date(create_temp_file) assert isinstance(creation_date, str) # Assert that the returned date is in ISO 8601 format @@ -360,7 +359,7 @@ def test_update_nillable_elements_from_xsd_file_with_edge_cases(): tree_duplicate_type = etree.fromstring(xsd_content_duplicate_type) # Mock the download_xsd_file function to return these trees based on input - with mock.patch( + with PATCH( 'pds4indextools.pds4_create_xml_index.download_xsd_file' ) as mock_download: # Define the behavior of the mock for each file @@ -403,7 +402,7 @@ def test_clean_header_field_names(): def test_compute_max_field_lengths(): lengths = tools.compute_max_field_lengths( - str(expected_dir / 'extra_file_info_success_1.csv')) + str(EXPECTED_DIR / 'extra_file_info_success_1.csv')) assert lengths == { 'filename': 18, @@ -416,7 +415,7 @@ def test_compute_max_field_lengths(): # failure with pytest.raises(SystemExit): - lengths = tools.compute_max_field_lengths(str(expected_dir / 'fake_file.csv')) + lengths = tools.compute_max_field_lengths(str(EXPECTED_DIR / 'fake_file.csv')) def test_sort_dataframe_key_error(): @@ -444,7 +443,7 @@ def test_validate_label_type(): assert tools.validate_label_type(arg, valid_choices) == 'Product_Ancillary' 
-@mock.patch('os.path.exists') +@PATCH('os.path.exists') def test_generate_unique_filename(mock_exists): # Setup the mock to return True for the first two checks and False thereafter mock_exists.side_effect = [True, True, False] @@ -488,9 +487,9 @@ def test_fill_text(): # Assume the get_true_type function is imported from the relevant module. # from pds4indextools.pds4_create_xml_index import get_true_type -@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') -@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') -@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') +@PATCH('pds4indextools.pds4_create_xml_index.download_xsd_file') +@PATCH('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@PATCH('pds4indextools.pds4_create_xml_index.find_base_attribute') def test_true_type_found_in_first_file(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): # Setup mocks @@ -510,9 +509,9 @@ def test_true_type_found_in_first_file(mock_find_base_attribute, mock_scrape_nam {"mock_namespace": "mock_value"}) -@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') -@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') -@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') +@PATCH('pds4indextools.pds4_create_xml_index.download_xsd_file') +@PATCH('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@PATCH('pds4indextools.pds4_create_xml_index.find_base_attribute') def test_true_type_found_in_second_file(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): # Setup mocks @@ -539,9 +538,9 @@ def test_true_type_found_in_second_file(mock_find_base_attribute, mock_scrape_na assert mock_find_base_attribute.call_count == 3 -@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') -@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') -@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') 
+@PATCH('pds4indextools.pds4_create_xml_index.download_xsd_file') +@PATCH('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@PATCH('pds4indextools.pds4_create_xml_index.find_base_attribute') def test_true_type_found_with_modified_tag(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): @@ -562,9 +561,9 @@ def test_true_type_found_with_modified_tag(mock_find_base_attribute, {"mock_namespace": "mock_value"}) -@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') -@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') -@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') +@PATCH('pds4indextools.pds4_create_xml_index.download_xsd_file') +@PATCH('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@PATCH('pds4indextools.pds4_create_xml_index.find_base_attribute') def test_true_type_not_found(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): # Setup mocks From d85ce1c3beb4c669899dee003f95da569ab4cf85 Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Mon, 26 Aug 2024 13:44:36 -0700 Subject: [PATCH 10/24] Fixing incorrect capitalization --- tests/test_pds4_create_xml_index_blackbox.py | 62 ++++++++++---------- tests/test_pds4_create_xml_index_whitebox.py | 32 +++++----- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index 949f9eb..75f09e7 100644 --- a/tests/test_pds4_create_xml_index_blackbox.py +++ b/tests/test_pds4_create_xml_index_blackbox.py @@ -31,7 +31,7 @@ def compare_files(path_to_file, golden_file): @pytest.mark.parametrize( - 'GOLDEN_FILE,NEW_FILE_INDEX,NEW_FILE_HEADERS,CMD_LINE', + 'golden_file,new_file_index,new_file_headers,cmd_line', [ # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" ( @@ -465,55 +465,55 @@ def compare_files(path_to_file, golden_file): ) ] ) -def test_success(GOLDEN_FILE, NEW_FILE_INDEX, 
NEW_FILE_HEADERS, CMD_LINE): +def test_success(golden_file, new_file_index, new_file_headers, cmd_line): # Create a temporary directory with tempfile.TemporaryDirectory(dir=TEST_FILES_DIR.parent) as temp_dir: temp_dir_path = Path(temp_dir) - if NEW_FILE_INDEX is None and NEW_FILE_HEADERS is None: + if new_file_index is None and new_file_headers is None: shutil.copy(LABELS_DIR / 'tester_label_1.xml', temp_dir_path) - CMD_LINE.append(str(temp_dir_path)) - CMD_LINE.append('tester_label_1.xml') - CMD_LINE.append('--output-index-file') - CMD_LINE.append(str(temp_dir_path / 'index.csv')) + cmd_line.append(str(temp_dir_path)) + cmd_line.append('tester_label_1.xml') + cmd_line.append('--output-index-file') + cmd_line.append(str(temp_dir_path / 'index.csv')) # Call main() function with the simulated command line arguments - tools.main(CMD_LINE) + tools.main(cmd_line) path_to_file = temp_dir_path / 'index.csv' - compare_files(path_to_file, GOLDEN_FILE) + compare_files(path_to_file, golden_file) else: # THE PATH TO THE NEW FILE - if NEW_FILE_INDEX: - path_to_file = temp_dir_path / NEW_FILE_INDEX - CMD_LINE.append('--output-index-file') - CMD_LINE.append(str(path_to_file)) + if new_file_index: + path_to_file = temp_dir_path / new_file_index + cmd_line.append('--output-index-file') + cmd_line.append(str(path_to_file)) # Call main() function with the simulated command line arguments - tools.main(CMD_LINE) + tools.main(cmd_line) - compare_files(path_to_file, GOLDEN_FILE) + compare_files(path_to_file, golden_file) - if '--generate-label' in CMD_LINE: + if '--generate-label' in cmd_line: label_path = str(path_to_file).replace('.csv', '.xml') - golden_label = str(GOLDEN_FILE).replace('.csv', '.xml') + golden_label = str(golden_file).replace('.csv', '.xml') assert os.path.isfile(label_path) compare_files(label_path, golden_label) - if NEW_FILE_HEADERS: - path_to_file = temp_dir_path / NEW_FILE_HEADERS - GOLDEN_FILE = str(GOLDEN_FILE).replace('.csv', '.txt') - 
CMD_LINE.append('--output-headers-file') - CMD_LINE.append(str(path_to_file)) + if new_file_headers: + path_to_file = temp_dir_path / new_file_headers + golden_file = str(golden_file).replace('.csv', '.txt') + cmd_line.append('--output-headers-file') + cmd_line.append(str(path_to_file)) # Call main() function with the simulated command line arguments - tools.main(CMD_LINE) + tools.main(cmd_line) - compare_files(path_to_file, GOLDEN_FILE) + compare_files(path_to_file, golden_file) @pytest.mark.parametrize( - 'CMD_LINE', + 'cmd_line', [ # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --add-extra-file-info bad_element --output-headers-file hdout.txt ( @@ -602,10 +602,10 @@ def test_success(GOLDEN_FILE, NEW_FILE_INDEX, NEW_FILE_HEADERS, CMD_LINE): ] ) -def test_failures(CMD_LINE): +def test_failures(cmd_line): # Call main() function with the simulated command line arguments with pytest.raises(SystemExit) as e: - tools.main(CMD_LINE) + tools.main(cmd_line) assert e.type == SystemExit assert e.value.code != 0 # Check that the exit code indicates failure if os.path.isfile('hdout.txt'): @@ -613,7 +613,7 @@ def test_failures(CMD_LINE): @pytest.mark.parametrize( - 'NEW_FILE,CMD_LINE', + 'NEW_FILE,cmd_line', [ # Executable command: pds4_create_xml_index ../test_files/labels "nilled_label_bad.xml" --limit-xpaths-file ../test_files/samples/elements_nilled_bad.txt --output-index-file indexout.csv ( @@ -628,17 +628,17 @@ def test_failures(CMD_LINE): ) ] ) -def test_failure_message(capfd, NEW_FILE, CMD_LINE): +def test_failure_message(capfd, NEW_FILE, cmd_line): with tempfile.TemporaryDirectory(dir=TEST_FILES_DIR.parent) as temp_dir: temp_dir_path = Path(temp_dir) # THE PATH TO THE NEW FILE path_to_file = temp_dir_path / NEW_FILE # Call main() function with the simulated command line arguments - CMD_LINE.append(str(path_to_file)) + 
cmd_line.append(str(path_to_file)) # Capture the output - tools.main(CMD_LINE) + tools.main(cmd_line) captured = capfd.readouterr() # Check if the expected statement is printed in stdout or stderr diff --git a/tests/test_pds4_create_xml_index_whitebox.py b/tests/test_pds4_create_xml_index_whitebox.py index 1fa6027..28221fd 100644 --- a/tests/test_pds4_create_xml_index_whitebox.py +++ b/tests/test_pds4_create_xml_index_whitebox.py @@ -7,7 +7,7 @@ import pytest import pds4indextools.pds4_create_xml_index as tools import textwrap as _textwrap -from unittest.mock import patch as PATCH +from unittest.mock import patch # These two variables are the same for all tests, so we can either declare them as @@ -278,7 +278,7 @@ def create_temp_file(): @pytest.mark.parametrize('platform_name', ['Windows', 'Linux', 'Darwin']) def test_get_creation_date(create_temp_file, platform_name): # Mock platform.system() to simulate different platforms - with PATCH('platform.system', return_value=platform_name): + with patch('platform.system', return_value=platform_name): creation_date = tools.get_creation_date(create_temp_file) assert isinstance(creation_date, str) # Assert that the returned date is in ISO 8601 format @@ -359,7 +359,7 @@ def test_update_nillable_elements_from_xsd_file_with_edge_cases(): tree_duplicate_type = etree.fromstring(xsd_content_duplicate_type) # Mock the download_xsd_file function to return these trees based on input - with PATCH( + with patch( 'pds4indextools.pds4_create_xml_index.download_xsd_file' ) as mock_download: # Define the behavior of the mock for each file @@ -443,7 +443,7 @@ def test_validate_label_type(): assert tools.validate_label_type(arg, valid_choices) == 'Product_Ancillary' -@PATCH('os.path.exists') +@patch('os.path.exists') def test_generate_unique_filename(mock_exists): # Setup the mock to return True for the first two checks and False thereafter mock_exists.side_effect = [True, True, False] @@ -487,9 +487,9 @@ def test_fill_text(): # Assume 
the get_true_type function is imported from the relevant module. # from pds4indextools.pds4_create_xml_index import get_true_type -@PATCH('pds4indextools.pds4_create_xml_index.download_xsd_file') -@PATCH('pds4indextools.pds4_create_xml_index.scrape_namespaces') -@PATCH('pds4indextools.pds4_create_xml_index.find_base_attribute') +@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') +@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') def test_true_type_found_in_first_file(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): # Setup mocks @@ -509,9 +509,9 @@ def test_true_type_found_in_first_file(mock_find_base_attribute, mock_scrape_nam {"mock_namespace": "mock_value"}) -@PATCH('pds4indextools.pds4_create_xml_index.download_xsd_file') -@PATCH('pds4indextools.pds4_create_xml_index.scrape_namespaces') -@PATCH('pds4indextools.pds4_create_xml_index.find_base_attribute') +@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') +@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') def test_true_type_found_in_second_file(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): # Setup mocks @@ -538,9 +538,9 @@ def test_true_type_found_in_second_file(mock_find_base_attribute, mock_scrape_na assert mock_find_base_attribute.call_count == 3 -@PATCH('pds4indextools.pds4_create_xml_index.download_xsd_file') -@PATCH('pds4indextools.pds4_create_xml_index.scrape_namespaces') -@PATCH('pds4indextools.pds4_create_xml_index.find_base_attribute') +@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') +@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') def test_true_type_found_with_modified_tag(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): @@ -561,9 
+561,9 @@ def test_true_type_found_with_modified_tag(mock_find_base_attribute, {"mock_namespace": "mock_value"}) -@PATCH('pds4indextools.pds4_create_xml_index.download_xsd_file') -@PATCH('pds4indextools.pds4_create_xml_index.scrape_namespaces') -@PATCH('pds4indextools.pds4_create_xml_index.find_base_attribute') +@patch('pds4indextools.pds4_create_xml_index.download_xsd_file') +@patch('pds4indextools.pds4_create_xml_index.scrape_namespaces') +@patch('pds4indextools.pds4_create_xml_index.find_base_attribute') def test_true_type_not_found(mock_find_base_attribute, mock_scrape_namespaces, mock_download_xsd_file): # Setup mocks From a5a3fb5e3ad87fa3381c61e0041ab3c28dfa17df Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Wed, 28 Aug 2024 13:59:02 -0700 Subject: [PATCH 11/24] Updated config file, cleaned up debugging code --- docs/pds4_create_xml_index.rst | 2 + pds4indextools/index_label_template_pds.xml | 35 ++++---- pds4indextools/pds4_create_xml_index.py | 90 +++++++-------------- test_files/expected/tester_config.yaml | 4 +- 4 files changed, 53 insertions(+), 78 deletions(-) diff --git a/docs/pds4_create_xml_index.rst b/docs/pds4_create_xml_index.rst index d083287..3de7fc6 100644 --- a/docs/pds4_create_xml_index.rst +++ b/docs/pds4_create_xml_index.rst @@ -344,6 +344,8 @@ For reference, provided below are the full contents of the optional label classe doi curating_facility description + File_Area_Ancillary / File_Area_Metadata: + creation_date_time If no new contents are specified for label generation, the label will contain the diff --git a/pds4indextools/index_label_template_pds.xml b/pds4indextools/index_label_template_pds.xml index 982db44..de9e94c 100644 --- a/pds4indextools/index_label_template_pds.xml +++ b/pds4indextools/index_label_template_pds.xml @@ -100,43 +100,42 @@ $END_IF $IF(Product_Ancillary) - $END_IF - $IF(Product_Metadata_Supplemental) + $ELSE $END_IF - $IF(Product_Ancillary and isinstance(Product_Ancillary, dict) and 'File_Area_Ancillary' in 
Product_Ancillary) - $IF(EXISTS(File)) + $IF(File_Area_Ancillary) $BASENAME(TEMPFILE)$ index-table - $File['creation_date_time']$ - $File['md5_checksum']$ + $IF(creation_date_time) + $File_Area_Ancillary['creation_date_time']$ + $ELSE + $DATETIME(creation_date_time)$ + $END_IF + $FILE_MD5(TEMPFILE)$ - $END_IF - $END_IF - $IF(Product_Metadata_Supplemental and isinstance(Product_Metadata_Supplemental, dict) and 'File_Area_Ancillary' in Product_Metadata_Supplemental) - $IF(File) + $ELSE_IF(File_Area_Metadata) $BASENAME(TEMPFILE)$ index-table - $File['creation_date_time']$ - $File['md5_checksum']$ + $IF(creation_date_time) + $File_Area_Metadata['creation_date_time']$ + $ELSE + $DATETIME(creation_date_time)$ + $END_IF + $FILE_MD5(TEMPFILE)$ - $END_IF - $END_IF + $ELSE $BASENAME(TEMPFILE)$ index-table - $IF(File) - $File['creation_date_time']$ - $ELSE $DATETIME(creation_date_time)$ - $END_IF $FILE_MD5(TEMPFILE)$ + $END_IF
0 $object_length_h$ diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index 34ca2ff..cba197c 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -121,14 +121,15 @@ def correct_duplicates(label_results): tag = key.split('/')[-1].split('<')[0] number = tag.split('_')[-1] if number.isdigit(): - cropped = tag.replace('_'+number, '') + cropped = tag.replace(f'_{number}', '') if cropped in element_names: - if str(cropped+'_'+number+'<1>') in key: - key_new = key.replace((cropped+'_'+number+'<1>'), cropped+'<1>') + if str(f'{cropped}_{number}<1>') in key: + key_new = key.replace((f'{cropped}_{number}<1>'), f'{cropped}<1>') else: - key_new = key.replace(cropped+'_'+number, cropped+'<1>') + key_new = key.replace(f'{cropped}_{number}', f'{cropped}<1>') parent = key_new.split('/')[-2].split('<')[0] - key_new = key_new.replace(parent+'<1>', parent+'<'+str(int(number)+1)+'>') + key_new = key_new.replace(f'{parent}<1>', + f'{parent}<{str(int(number)+1)}>') label_results[key_new] = label_results.pop(key) element_names.add(tag) @@ -301,7 +302,7 @@ def filter_dict_by_glob_patterns(input_dict, glob_patterns, valid_add_extra_file def get_true_type(xsd_files, tag, namespaces): """ - Determines the true type of a specified tag by searching through a list of XSD files. + Returns the true type of a specified tag by searching through a list of XSD files. This function iterates through the provided list of XSD files and attempts to find the "true type" of the given XML tag by examining its attributes and base types. If the @@ -446,7 +447,7 @@ def process_headers(label_results, key, root, namespaces, prefixes): label_results[key_new] = label_results.pop(key) -def renumber_xpaths(xpaths, args): +def renumber_xpaths(xpaths): """ Renumber a list of XPaths to be sequential at each level. 
@@ -498,7 +499,6 @@ def renumber_xpaths(xpaths, args): Parameters: xpaths (list): The list of XPaths or XPath fragments. - args (argparse.Namespace): Arguments parsed from command line using argparse. Returns: @@ -569,7 +569,7 @@ def split_xpath_prefix_and_num(s): # down. children = [x for x in parent_group_list if x.child is not None] if children: - child_map = renumber_xpaths([x.child for x in children], args) + child_map = renumber_xpaths([x.child for x in children]) xpath_map.update( { f'{x.parent}/{x.child}': ( @@ -642,18 +642,7 @@ def store_element_text(element, tree, results_dict, xsd_files, nillable_elements if not parent_check: print(f'Non-nillable element in {label_filename} ' f'has no associated text: {tag}') - true_type = None - for xsd_file in xsd_files: - xsd_tree = download_xsd_file(xsd_file) - namespaces = scrape_namespaces(xsd_tree) - true_type = find_base_attribute(xsd_tree, tag, namespaces) - if not true_type: - modified_tag = tag + "_WO_Units" - true_type = find_base_attribute(xsd_tree, modified_tag, - namespaces) - # if true_type: - # break - + true_type = get_true_type(xsd_files, tag, tree.getroot().nsmap) default = default_value_for_nil(config, true_type, nil_value) results_dict[xpath] = default @@ -725,8 +714,7 @@ def update_nillable_elements_from_xsd_file(xsd_file, nillable_elements_info): # Attempt to find the type definition in the document type_definition_xpath = (f'//xs:simpleType[@name="{type_name}"] | ' f'//xs:complexType[@name="{type_name}"]') - type_definition = tree.xpath( - type_definition_xpath, namespaces=namespace) + type_definition = tree.xpath(type_definition_xpath, namespaces=namespace) if type_definition: # Take the first match @@ -744,8 +732,7 @@ def update_nillable_elements_from_xsd_file(xsd_file, nillable_elements_info): namespaces=namespace) base_type = extension.get('base') - nillable_elements_info[name] = ( - base_type or 'External or built-in type') + nillable_elements_info[name] = base_type or 'External or built-in 
type' else: # Type definition not found, might be external or built-in type nillable_elements_info[name] = 'External or built-in type' @@ -947,7 +934,7 @@ def sort_dataframe(df, sort_keys): Raises: ValueError: If any of the provided sort keys are not found in the DataFrame, - a `ValueError` is raised with a descriptive error message. + a `ValueError` is raised with a descriptive error message. Example: >>> df = pd.DataFrame({ @@ -961,10 +948,6 @@ def sort_dataframe(df, sort_keys): 2 Charlie 22 0 Alice 25 1 Bob 30 - - Notes: - - The sorting is done in place, so the original DataFrame is modified. - - The function will raise an error if any of the specified sort keys are invalid. """ try: df.sort_values(by=sort_keys, inplace=True) @@ -1299,19 +1282,10 @@ def main(cmd_line=None): for pattern in patterns: files = directory_path.glob(pattern) - - # Create an iterator from the generator - files_iter = iter(files) - - # Use a sentinel object to check if there's any item - sentinel = object() - first_file = next(files_iter, sentinel) - - if first_file is sentinel: + prev_len = len(label_files) + label_files.extend(files) + if len(label_files) == prev_len: print(f"No files found for pattern: {pattern}") - else: - # If not empty, continue processing and include the first file - label_files.extend(itertools.chain([first_file], files_iter)) verboseprint(f'{len(label_files)} matching file(s) found') @@ -1370,10 +1344,10 @@ def main(cmd_line=None): traverse_and_store(root, tree, label_results, xsd_files, nillable_elements_info, config, label_file) - # # The XPath headers in the label_results dictionary are reformatted to - # # improve readability. Each XPath's namespace is replaced with its prefix for - # # faster reference. Duplicate XPaths are made unique to ensure all results are - # # present in the final product. + # The XPath headers in the label_results dictionary are reformatted to + # improve readability. 
Each XPath's namespace is replaced with its prefix for + # faster reference. Duplicate XPaths are made unique to ensure all results are + # present in the final product. for key in list(label_results): process_headers(label_results, key, root, namespaces, prefixes) @@ -1387,7 +1361,7 @@ def main(cmd_line=None): new_parts = [] for part in parts: if not part.endswith('>') and parts.index(part) != 1: - part = part+'<1>' + part = f'{part}<1>' new_parts.append(part) else: new_parts.append(part) @@ -1402,7 +1376,7 @@ def main(cmd_line=None): # the column refers to. At this stage, duplicate XPaths may exist again due to # the reformatting. These duplicates are corrected to preserve the contents of # each element's value. - xpath_map = renumber_xpaths(label_results, args) + xpath_map = renumber_xpaths(label_results) for old_xpath, new_xpath in xpath_map.items(): label_results[new_xpath] = label_results.pop(old_xpath) @@ -1414,8 +1388,8 @@ def main(cmd_line=None): try: lid = extract_logical_identifier(tree) except AttributeError: - print(f"Label file {label_file} does not have a " - f"logical_identifier attribute.") + print(f'Label file {label_file} does not have a ' + f'logical_identifier attribute.') sys.exit(1) # Attach extra columns if asked for. @@ -1439,20 +1413,19 @@ def main(cmd_line=None): # dictionary will be returned. Glob patterns are processed sequentially, with the # first pattern having the highest priority. 
- for label_results in all_results: - ind = all_results.index(label_results) + for ind, label_results in enumerate(all_results): label_results_new = filter_dict_by_glob_patterns( label_results, elements_to_scrape, valid_add_extra_file_info, verboseprint) all_results[ind] = label_results_new - if all(len(set(r)) == 0 for r in all_results): + if all(len(r) == 0 for r in all_results): print('No results found: glob pattern(s) excluded all matches.') sys.exit(1) - # # If --simplify-xpaths is used, the XPath headers will be shortened to the - # # element's tag and namespace prefix. This is contingent on the uniqueness of - # # the XPath header; if more than one XPath header shares a tag, a namespace and a - # # predicate value, the XPath header will remain whole. + # If --simplify-xpaths is used, the XPath headers will be shortened to the + # element's tag and namespace prefix. This is contingent on the uniqueness of + # the XPath header; if more than one XPath header shares a tag, a namespace and a + # predicate value, the XPath header will remain whole. 
if args.simplify_xpaths: headers = {} unique_tags_master = [] @@ -1487,8 +1460,7 @@ def main(cmd_line=None): for tag in unique_tags: unique_tags_master.append(tag) - for label_results in all_results: - ind = all_results.index(label_results) + for ind, label_results in enumerate(all_results): new_label_results = {} for key, value in list(label_results.items()): new_key = headers[key] diff --git a/test_files/expected/tester_config.yaml b/test_files/expected/tester_config.yaml index 7b29fd8..0cd13cc 100644 --- a/test_files/expected/tester_config.yaml +++ b/test_files/expected/tester_config.yaml @@ -19,5 +19,7 @@ nillable: label-contents: version_id: 1.1 - File: + File_Area_Metadata: + creation_date_time: '0001-01-01T00:00:00.00Z' + File_Area_Ancillary: creation_date_time: '0001-01-01T00:00:00.00Z' From f4d745b8b9539eb26fa67e0133f4fbf5ec4e9cdc Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Wed, 28 Aug 2024 14:16:15 -0700 Subject: [PATCH 12/24] Updated label template with statements --- pds4indextools/index_label_template_pds.xml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pds4indextools/index_label_template_pds.xml b/pds4indextools/index_label_template_pds.xml index de9e94c..cad2abe 100644 --- a/pds4indextools/index_label_template_pds.xml +++ b/pds4indextools/index_label_template_pds.xml @@ -191,13 +191,11 @@ $END_IF $END_IF $IF(Product_Ancillary) - $END_IF - $IF(Product_Metadata_Supplemental) + $ELSE $END_IF $IF(Product_Ancillary) -$END_IF -$IF(Product_Metadata_Supplemental) +$ELSE $END_IF From 11430816761956b5d6c084c4b8b73eb0f3e4d39e Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Thu, 29 Aug 2024 10:35:07 -0700 Subject: [PATCH 13/24] Fixed duplicate scraped label issue caused by generalized glob patterns --- pds4indextools/pds4_create_xml_index.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index cba197c..a26c4fb 
100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -1263,7 +1263,7 @@ def main(cmd_line=None): # will determine which files will be scraped for. nillable_elements_info = {} - label_files = [] + collected_files = set() all_results = [] tags = [] xsd_files = [] @@ -1282,13 +1282,15 @@ def main(cmd_line=None): for pattern in patterns: files = directory_path.glob(pattern) - prev_len = len(label_files) - label_files.extend(files) - if len(label_files) == prev_len: + prev_len = len(collected_files) + collected_files.update(files) + if len(collected_files) == prev_len: print(f"No files found for pattern: {pattern}") - verboseprint(f'{len(label_files)} matching file(s) found') + verboseprint(f'{len(collected_files)} matching file(s) found') + label_files = list(collected_files) + label_files.sort() if label_files == []: print(f'No files matching any patterns found in directory: {directory_path}') sys.exit(1) From 2fb7bab16d277a3690c0cca58b9bbd608ed4470d Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Thu, 29 Aug 2024 11:00:06 -0700 Subject: [PATCH 14/24] Got unit test coverage back up to 100% --- pds4indextools/pds4_create_xml_index.py | 6 ++++-- tests/test_pds4_create_xml_index_blackbox.py | 8 +++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index a26c4fb..5b4c26b 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -893,13 +893,15 @@ def get_base_type(query): f"/*[local-name()='extension']/*/*/*/@base" ] + base_type = None for query in queries: result = get_base_type(query) if result: base_type = result[0] - return base_type + else: + continue - return None + return base_type def scrape_namespaces(tree): diff --git a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index 75f09e7..14afe97 100644 --- 
a/tests/test_pds4_create_xml_index_blackbox.py +++ b/tests/test_pds4_create_xml_index_blackbox.py @@ -471,17 +471,15 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): temp_dir_path = Path(temp_dir) if new_file_index is None and new_file_headers is None: - shutil.copy(LABELS_DIR / 'tester_label_1.xml', temp_dir_path) - cmd_line.append(str(temp_dir_path)) + cmd_line.append(str(LABELS_DIR)) cmd_line.append('tester_label_1.xml') - cmd_line.append('--output-index-file') - cmd_line.append(str(temp_dir_path / 'index.csv')) # Call main() function with the simulated command line arguments tools.main(cmd_line) - path_to_file = temp_dir_path / 'index.csv' + path_to_file = ROOT_DIR / 'index.csv' compare_files(path_to_file, golden_file) + os.remove(path_to_file) else: # THE PATH TO THE NEW FILE From 9f6bfc5087eaa9f79a28117165c508d935de0e0f Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Thu, 29 Aug 2024 14:44:11 -0700 Subject: [PATCH 15/24] Making flake8 compliant --- tests/test_pds4_create_xml_index_blackbox.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index 14afe97..ad5fabc 100644 --- a/tests/test_pds4_create_xml_index_blackbox.py +++ b/tests/test_pds4_create_xml_index_blackbox.py @@ -2,7 +2,6 @@ import pytest import os import tempfile -import shutil import pds4indextools.pds4_create_xml_index as tools From a138f7cf35849e083e7769ffe63c7c3990ffb4fa Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Wed, 4 Sep 2024 16:05:56 -0700 Subject: [PATCH 16/24] Making changes according to pull request --- pds4indextools/default_config.yaml | 2 ++ pds4indextools/index_label_template_pds.xml | 14 +++++++------- pds4indextools/pds4_create_xml_index.py | 2 +- test_files/expected/label_success_2.xml | 2 +- .../{expected => samples}/tester_config.yaml | 2 +- .../{expected => samples}/tester_config_label.yaml | 0 .../tester_config_nillable.yaml | 0 
tests/test_pds4_create_xml_index_blackbox.py | 12 ++++++------ tests/test_pds4_create_xml_index_whitebox.py | 5 +++-- 9 files changed, 21 insertions(+), 18 deletions(-) rename test_files/{expected => samples}/tester_config.yaml (91%) rename test_files/{expected => samples}/tester_config_label.yaml (100%) rename test_files/{expected => samples}/tester_config_nillable.yaml (100%) diff --git a/pds4indextools/default_config.yaml b/pds4indextools/default_config.yaml index a2a5a48..b23fc05 100644 --- a/pds4indextools/default_config.yaml +++ b/pds4indextools/default_config.yaml @@ -44,3 +44,5 @@ label-contents: External_Reference: Source_Product_Internal: Source_Product_External: + File_Area_Ancillary: + File_Area_Metadata: diff --git a/pds4indextools/index_label_template_pds.xml b/pds4indextools/index_label_template_pds.xml index cad2abe..fdc59aa 100644 --- a/pds4indextools/index_label_template_pds.xml +++ b/pds4indextools/index_label_template_pds.xml @@ -103,26 +103,26 @@ $END_IF $ELSE $END_IF - $IF(File_Area_Ancillary) + $IF(Product_Ancillary and File_Area_Ancillary) $BASENAME(TEMPFILE)$ index-table - $IF(creation_date_time) + $IF(File_Area_Ancillary['creation_date_time']) $File_Area_Ancillary['creation_date_time']$ $ELSE - $DATETIME(creation_date_time)$ + $DATETIME(calculated_creation_date_time)$ $END_IF $FILE_MD5(TEMPFILE)$ - $ELSE_IF(File_Area_Metadata) + $ELSE_IF(Product_Metadata_Supplemental and File_Area_Metadata) $BASENAME(TEMPFILE)$ index-table - $IF(creation_date_time) + $IF(File_Area_Metadata['creation_date_time']) $File_Area_Metadata['creation_date_time']$ $ELSE - $DATETIME(creation_date_time)$ + $DATETIME(calculated_creation_date_time)$ $END_IF $FILE_MD5(TEMPFILE)$ @@ -131,7 +131,7 @@ $END_IF $BASENAME(TEMPFILE)$ index-table - $DATETIME(creation_date_time)$ + $DATETIME(calculated_creation_date_time)$ $FILE_MD5(TEMPFILE)$ diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index 5b4c26b..1be5600 100644 --- 
a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -1587,7 +1587,7 @@ def main(cmd_line=None): # .yaml file from --config-file label_content = { 'logical_identifier': 'urn:nasa:pds:rms_metadata:document_opus:' + filename, - 'creation_date_time': str(creation_date), + 'calculated_creation_date_time': str(creation_date), 'TEMPFILE': index_file, 'Field_Content': header_info, 'fields': len(header_info), diff --git a/test_files/expected/label_success_2.xml b/test_files/expected/label_success_2.xml index 628ae41..23a5758 100644 --- a/test_files/expected/label_success_2.xml +++ b/test_files/expected/label_success_2.xml @@ -26,7 +26,7 @@ generated_label_2.csv index-table - 0001-01-01T00:00:00.00Z + 0002-02-02T00:00:00.00Z 53d47b320936ac3fbba0852696065418 diff --git a/test_files/expected/tester_config.yaml b/test_files/samples/tester_config.yaml similarity index 91% rename from test_files/expected/tester_config.yaml rename to test_files/samples/tester_config.yaml index 0cd13cc..9097074 100644 --- a/test_files/expected/tester_config.yaml +++ b/test_files/samples/tester_config.yaml @@ -20,6 +20,6 @@ nillable: label-contents: version_id: 1.1 File_Area_Metadata: - creation_date_time: '0001-01-01T00:00:00.00Z' + creation_date_time: '0002-02-02T00:00:00.00Z' File_Area_Ancillary: creation_date_time: '0001-01-01T00:00:00.00Z' diff --git a/test_files/expected/tester_config_label.yaml b/test_files/samples/tester_config_label.yaml similarity index 100% rename from test_files/expected/tester_config_label.yaml rename to test_files/samples/tester_config_label.yaml diff --git a/test_files/expected/tester_config_nillable.yaml b/test_files/samples/tester_config_nillable.yaml similarity index 100% rename from test_files/expected/tester_config_nillable.yaml rename to test_files/samples/tester_config_nillable.yaml diff --git a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index ad5fabc..aa8dff5 100644 --- 
a/tests/test_pds4_create_xml_index_blackbox.py +++ b/tests/test_pds4_create_xml_index_blackbox.py @@ -403,7 +403,7 @@ def compare_files(path_to_file, golden_file): ] ), - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary --config ../test_files/expected/tester_config.yaml --output-index-file generated_label_1.csv + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary --config ../test_files/samples/tester_config.yaml --output-index-file generated_label_1.csv # Compare result to golden copy: # test_files/expected/label_success_1.csv # test_files/expected/label_success_1.xml @@ -416,11 +416,11 @@ def compare_files(path_to_file, golden_file): '--generate-label', 'ancillary', '--config', - str(EXPECTED_DIR / 'tester_config.yaml') + str(SAMPLES_DIR / 'tester_config.yaml') ] ), - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label metadata --fixed-width --output-index-file generated_label_2.csv --config ../test_files/expected/tester_config.yaml --output-index-file generated_label_2.csv + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label metadata --fixed-width --output-index-file generated_label_2.csv --config ../test_files/samples/tester_config.yaml --output-index-file generated_label_2.csv # Compare result to golden copy: # test_files/expected/label_success_2.csv # test_files/expected/label_success_2.xml @@ -434,11 +434,11 @@ def compare_files(path_to_file, golden_file): 'metadata', '--fixed-width', '--config', - str(EXPECTED_DIR / 'tester_config.yaml') + str(SAMPLES_DIR / 'tester_config.yaml') ] ), - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_5.txt --add-extra-file-info filename,filepath,lid,bundle,bundle_lid 
--generate-label ancillary --config ../test_files/expected/tester_config.yaml --output-index-file generated_label_3.csv + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_5.txt --add-extra-file-info filename,filepath,lid,bundle,bundle_lid --generate-label ancillary --config ../test_files/samples/tester_config.yaml --output-index-file generated_label_3.csv # Compare result to golden copy: # test_files/expected/label_success_3.csv # test_files/expected/label_success_3.xml @@ -459,7 +459,7 @@ def compare_files(path_to_file, golden_file): '--generate-label', 'ancillary', '--config', - str(EXPECTED_DIR / 'tester_config.yaml') + str(SAMPLES_DIR / 'tester_config.yaml') ] ) ] diff --git a/tests/test_pds4_create_xml_index_whitebox.py b/tests/test_pds4_create_xml_index_whitebox.py index 28221fd..62a8c83 100644 --- a/tests/test_pds4_create_xml_index_whitebox.py +++ b/tests/test_pds4_create_xml_index_whitebox.py @@ -14,6 +14,7 @@ # global variables, or get the ROOT_DIR at the setup stage before running each test ROOT_DIR = Path(__file__).resolve().parent.parent TEST_FILES_DIR = ROOT_DIR / 'test_files' +SAMPLES_DIR = TEST_FILES_DIR / 'samples' EXPECTED_DIR = TEST_FILES_DIR / 'expected' LABELS_DIR = TEST_FILES_DIR / 'labels' @@ -66,7 +67,7 @@ def test_load_config_object(): # Tests that the config_object is loaded over. 
config_object = tools.load_config_file( - specified_config_files=[str(EXPECTED_DIR / 'tester_config_nillable.yaml'),]) + specified_config_files=[str(SAMPLES_DIR / 'tester_config_nillable.yaml'),]) assert config_object['nillable']['pds:ASCII_Date_YMD']['inapplicable'] == '0001-01-01' assert config_object['nillable']['pds:ASCII_Date_YMD']['missing'] == '0002-01-01' @@ -94,7 +95,7 @@ def test_load_config_object(): # Tests specified configuration files wiht one or the other config_object = tools.load_config_file( - specified_config_files=[str(EXPECTED_DIR / 'tester_config_label.yaml'),]) + specified_config_files=[str(SAMPLES_DIR / 'tester_config_label.yaml'),]) assert config_object['label-contents']['version_id'] == '1.0' assert (config_object['label-contents']['title'] == From 59be059ec5d585719a394db2b720f0f4b7562264 Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Wed, 4 Sep 2024 16:49:32 -0700 Subject: [PATCH 17/24] Adding further implementation in label template --- pds4indextools/index_label_template_pds.xml | 42 ++++++++++----------- pds4indextools/pds4_create_xml_index.py | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pds4indextools/index_label_template_pds.xml b/pds4indextools/index_label_template_pds.xml index fdc59aa..2f8dea3 100644 --- a/pds4indextools/index_label_template_pds.xml +++ b/pds4indextools/index_label_template_pds.xml @@ -62,37 +62,37 @@ $END_IF $IF(Internal_Reference) $FOR(field, k=Internal_Reference) - - - + $field['lid_reference']$ + $field['reference_type']$ + $field['comment']$ $END_FOR $END_IF $IF(External_Reference) $FOR(field, k=External_Reference) - - - + $field['doi']$ + $field['reference_text']$ + $field['description']$ $END_FOR $END_IF $IF(Source_Product_Internal) $FOR(field, k=Source_Product_Internal) - - - + $field['lidvid_reference']$ + $field['reference_type']$ + $field['comment']$ $END_FOR $END_IF $IF(Source_Product_External) $FOR(field, k=Source_Product_External) - - - - + 
$external_source_product_identifier']$ + $field['reference_type']$ + $field['doi']$ + $field['curating_facility']$ $END_FOR @@ -105,34 +105,34 @@ $END_IF $END_IF $IF(Product_Ancillary and File_Area_Ancillary) - $BASENAME(TEMPFILE)$ + $BASENAME(index_file_name)$ index-table $IF(File_Area_Ancillary['creation_date_time']) $File_Area_Ancillary['creation_date_time']$ $ELSE $DATETIME(calculated_creation_date_time)$ $END_IF - $FILE_MD5(TEMPFILE)$ + $FILE_MD5(index_file_name)$ $ELSE_IF(Product_Metadata_Supplemental and File_Area_Metadata) - $BASENAME(TEMPFILE)$ + $BASENAME(index_file_name)$ index-table $IF(File_Area_Metadata['creation_date_time']) $File_Area_Metadata['creation_date_time']$ $ELSE $DATETIME(calculated_creation_date_time)$ $END_IF - $FILE_MD5(TEMPFILE)$ + $FILE_MD5(index_file_name)$ $ELSE - $BASENAME(TEMPFILE)$ + $BASENAME(index_file_name)$ index-table $DATETIME(calculated_creation_date_time)$ - $FILE_MD5(TEMPFILE)$ + $FILE_MD5(index_file_name)$ $END_IF @@ -146,7 +146,7 @@ $END_IF $object_length_t$ - $FILE_RECORDS(TEMPFILE)$ + $FILE_RECORDS(index_file_name)$ Line-Feed @@ -170,7 +170,7 @@ $END_IF 0 $object_length_t$ PDS DSV 1 - $FILE_RECORDS(TEMPFILE)$ + $FILE_RECORDS(index_file_name)$ Line-Feed Comma diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index 1be5600..9bf6439 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -1588,7 +1588,7 @@ def main(cmd_line=None): label_content = { 'logical_identifier': 'urn:nasa:pds:rms_metadata:document_opus:' + filename, 'calculated_creation_date_time': str(creation_date), - 'TEMPFILE': index_file, + 'index_file_name': index_file, 'Field_Content': header_info, 'fields': len(header_info), 'maximum_record_length': get_longest_row_length(index_file), From 94b43c1c26f0b0a7f2f10c631d6fe0955233d16d Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Thu, 5 Sep 2024 12:05:01 -0700 Subject: [PATCH 18/24] Added unit tests for references 
in label generation --- pds4indextools/index_label_template_pds.xml | 4 +- .../expected/label_references_success.csv | 2 + .../expected/label_references_success.xml | 158 ++++++++++++++++++ .../samples/tester_config_reference.yaml | 36 ++++ tests/test_pds4_create_xml_index_blackbox.py | 27 ++- 5 files changed, 219 insertions(+), 8 deletions(-) create mode 100644 test_files/expected/label_references_success.csv create mode 100644 test_files/expected/label_references_success.xml create mode 100644 test_files/samples/tester_config_reference.yaml diff --git a/pds4indextools/index_label_template_pds.xml b/pds4indextools/index_label_template_pds.xml index 2f8dea3..5f7aca3 100644 --- a/pds4indextools/index_label_template_pds.xml +++ b/pds4indextools/index_label_template_pds.xml @@ -89,11 +89,11 @@ $END_IF $IF(Source_Product_External) $FOR(field, k=Source_Product_External) - $external_source_product_identifier']$ + $field['external_source_product_identifier']$ $field['reference_type']$ $field['doi']$ $field['curating_facility']$ - + $field['description']$ $END_FOR $END_IF diff --git a/test_files/expected/label_references_success.csv b/test_files/expected/label_references_success.csv new file mode 100644 index 0000000..0eb78a1 --- /dev/null +++ b/test_files/expected/label_references_success.csv @@ -0,0 +1,2 @@ +pds:logical_identifier<1>,pds:version_id<1>,pds:title<1>,pds:information_model_version<1>,pds:Product_Observational/pds:Observing_System<1>/pds:name<1>,pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:name<1>,pds:type<1>,pds:lid_reference<1>,pds:reference_type<1> +urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n,1.0,Cassini ISS Image 1455200455n.img,1.11.0.0,Cassini Orbiter Imaging Science Subsystem,Cassini Orbiter,Spacecraft,urn:nasa:pds:context:instrument_host:spacecraft.co,is_instrument_host diff --git a/test_files/expected/label_references_success.xml b/test_files/expected/label_references_success.xml new file mode 
100644 index 0000000..51e978f --- /dev/null +++ b/test_files/expected/label_references_success.xml @@ -0,0 +1,158 @@ + + + + + + urn:nasa:pds:rms_metadata:document_opus:label_references + 1.0 + Index file for my occultation bundle + 1.21.0.0 + Product_Ancillary + + + 2024-01-01 + 1.1 + This is a lengthy description of what this modification +changed in the bundle. +There were lots of changes. + + + + 2023-01-01 + 1. + Initial release. + + + + Creative Common Public License CC0 1.0 (2024) + Creative Commons Zero (CC0) license information. + + urn:nasa:pds:system_bundle:document_pds4_standards:creative_commons_1.0.0::1.0 + product_to_license + + + + + + urn:nasa:pds:cassini_iss_cruise:data_raw:body-geometry + data_to_resource + The index table of body surface geometry information associated with each observation. + + + urn:nasa:pds:cassini_iss_cruise:body-inventory + data_to_resource + An index listing every Saturn system body expected to fall within each field of view. + + + 10.1086/113662 + Elliot et al. (1984). "Structure of the Uranian rings. I. Square-well model and particle-size constraints" Astron J. 89, 1587-1603. + reference material + + + urn:nasa:pds:insight-ifg-mars:data-ifg-raw:ifg-raw-sol0014-20181211t021721-20181211t150435-pt2hz::5.0 + data_to_raw_source_product + Raw data used in processing + + + CO-S-UVIS-2-CUBE-V1.4:COUVIS_0056/DATA/D2016_245/EUV2016_245_17_49 + data_to_raw_source_product + None + PDS RMS Node + The original PDS3 version of this product. The form of the reference is dataset_id:volume_id:directory_path:file_name. + + + + + label_references.csv + index-table + 0001-01-01T00:00:00.00Z + 85e4697006ea9a54e7eafa8cf4b9bb40 + + +
+ 0 + 303 + UTF-8 Text + Provides the column headers, separated by commas, for the data table. +
+ + 0 + 542 + PDS DSV 1 + 2 + Line-Feed + Comma + + 9 + 0 + 302 + + pds:logical_identifier<1> + 1 + ASCII_Short_String_Collapsed + 52 + + + + pds:version_id<1> + 2 + ASCII_Short_String_Collapsed + 3 + + + + pds:title<1> + 3 + ASCII_Short_String_Collapsed + 33 + + + + pds:information_model_version<1> + 4 + ASCII_Short_String_Collapsed + 8 + + + + pds:Product_Observational/pds:Observing_System<1>/pds:name<1> + 5 + UTF8_Short_String_Collapsed + 41 + + + + pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:name<1> + 6 + UTF8_Short_String_Collapsed + 15 + + + + pds:type<1> + 7 + ASCII_Short_String_Collapsed + 10 + + + + pds:lid_reference<1> + 8 + ASCII_LID + 50 + + + + pds:reference_type<1> + 9 + ASCII_Short_String_Collapsed + 18 + + + + +
+
diff --git a/test_files/samples/tester_config_reference.yaml b/test_files/samples/tester_config_reference.yaml new file mode 100644 index 0000000..266e540 --- /dev/null +++ b/test_files/samples/tester_config_reference.yaml @@ -0,0 +1,36 @@ + +label-contents: + title: Index file for my occultation bundle + Modification_Detail: + - modification_date: '2024-01-01' + version_id: 1.1 + description: | + This is a lengthy description of what this modification + changed in the bundle. + There were lots of changes. + - modification_date: '2023-01-01' + version_id: 1.0 + description: Initial release. + Internal_Reference: + - lid_reference: urn:nasa:pds:cassini_iss_cruise:data_raw:body-geometry + reference_type: data_to_resource + comment: The index table of body surface geometry information associated with each observation. + - lid_reference: urn:nasa:pds:cassini_iss_cruise:body-inventory + reference_type: data_to_resource + comment: An index listing every Saturn system body expected to fall within each field of view. + External_Reference: + - doi: 10.1086/113662 + reference_text: Elliot et al. (1984). "Structure of the Uranian rings. I. Square-well model and particle-size constraints" Astron J. 89, 1587-1603. + description: reference material + Source_Product_Internal: + - lidvid_reference: urn:nasa:pds:insight-ifg-mars:data-ifg-raw:ifg-raw-sol0014-20181211t021721-20181211t150435-pt2hz::5.0 + reference_type: data_to_raw_source_product + comment: Raw data used in processing + Source_Product_External: + - external_source_product_identifier: CO-S-UVIS-2-CUBE-V1.4:COUVIS_0056/DATA/D2016_245/EUV2016_245_17_49 + reference_type: data_to_raw_source_product + doi: + curating_facility: PDS RMS Node + description: The original PDS3 version of this product. The form of the reference is dataset_id:volume_id:directory_path:file_name. 
+ File_Area_Ancillary: + creation_date_time: '0001-01-01T00:00:00.00Z' \ No newline at end of file diff --git a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index aa8dff5..0c0b395 100644 --- a/tests/test_pds4_create_xml_index_blackbox.py +++ b/tests/test_pds4_create_xml_index_blackbox.py @@ -49,6 +49,21 @@ def compare_files(path_to_file, golden_file): ] ), + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary --config-file ../test_files/samples/tester_config_reference.yaml --output-index-file label_references_success.csv --simplify-xpaths + ( + str(EXPECTED_DIR / 'label_references_success.csv'), + 'label_references.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/tester_label_1.xml', + '--generate-label', + 'ancillary', + '--config-file', + str(SAMPLES_DIR / 'tester_config_reference.yaml'), + '--simplify-xpaths' + ] + ), + # Testing --limit-xpaths-file with two outputs # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --output-headers-file limit_xpaths_file.txt --output-index-file limit_xpaths_file.csv # Compare result to golden copy: @@ -403,7 +418,7 @@ def compare_files(path_to_file, golden_file): ] ), - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary --config ../test_files/samples/tester_config.yaml --output-index-file generated_label_1.csv + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary --config-file ../test_files/samples/tester_config.yaml --output-index-file generated_label_1.csv # Compare result to golden copy: # test_files/expected/label_success_1.csv # test_files/expected/label_success_1.xml @@ -415,12 +430,12 @@ def compare_files(path_to_file, golden_file): LABEL_NAME + '/tester_label_1.xml', '--generate-label', 
'ancillary', - '--config', + '--config-file', str(SAMPLES_DIR / 'tester_config.yaml') ] ), - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label metadata --fixed-width --output-index-file generated_label_2.csv --config ../test_files/samples/tester_config.yaml --output-index-file generated_label_2.csv + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label metadata --fixed-width --output-index-file generated_label_2.csv --config-file ../test_files/samples/tester_config.yaml --output-index-file generated_label_2.csv # Compare result to golden copy: # test_files/expected/label_success_2.csv # test_files/expected/label_success_2.xml @@ -433,12 +448,12 @@ def compare_files(path_to_file, golden_file): '--generate-label', 'metadata', '--fixed-width', - '--config', + '--config-file', str(SAMPLES_DIR / 'tester_config.yaml') ] ), - # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_5.txt --add-extra-file-info filename,filepath,lid,bundle,bundle_lid --generate-label ancillary --config ../test_files/samples/tester_config.yaml --output-index-file generated_label_3.csv + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" "tester_label_2.xml" "tester_label_3.xml" --limit-xpaths-file ../test_files/samples/element_5.txt --add-extra-file-info filename,filepath,lid,bundle,bundle_lid --generate-label ancillary --config-file ../test_files/samples/tester_config.yaml --output-index-file generated_label_3.csv # Compare result to golden copy: # test_files/expected/label_success_3.csv # test_files/expected/label_success_3.xml @@ -458,7 +473,7 @@ def compare_files(path_to_file, golden_file): 'filename', '--generate-label', 'ancillary', - '--config', + '--config-file', str(SAMPLES_DIR / 'tester_config.yaml') ] ) From 
6639527d8daa8f87d2a14a7cedec36f841907f76 Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Thu, 17 Oct 2024 15:04:32 -0700 Subject: [PATCH 19/24] Making changes according to pull request --- docs/pds4_create_xml_index.rst | 65 ++--- pds4indextools/index_label_template_pds.xml | 32 ++- pds4indextools/pds4_create_xml_index.py | 232 ++++++++---------- .../cleaned_headers_label_success.csv | 3 + .../cleaned_headers_label_success.xml | 226 +++++++++++++++++ test_files/expected/label_success_1.xml | 2 - test_files/expected/label_success_2.xml | 2 - test_files/expected/label_success_3.xml | 2 - test_files/expected/nested_label_success.txt | 25 ++ test_files/labels/nested_label.xml | 74 ++++++ test_files/labels/rf-tester-label_1.xml | 66 +++++ test_files/labels/rf-tester-label_2.xml | 50 ++++ test_files/samples/tester_config_label.yaml | 28 ++- tests/test_pds4_create_xml_index_blackbox.py | 34 ++- tests/test_pds4_create_xml_index_whitebox.py | 22 -- 15 files changed, 663 insertions(+), 200 deletions(-) create mode 100644 test_files/expected/cleaned_headers_label_success.csv create mode 100644 test_files/expected/cleaned_headers_label_success.xml create mode 100644 test_files/expected/nested_label_success.txt create mode 100644 test_files/labels/nested_label.xml create mode 100644 test_files/labels/rf-tester-label_1.xml create mode 100644 test_files/labels/rf-tester-label_2.xml diff --git a/docs/pds4_create_xml_index.rst b/docs/pds4_create_xml_index.rst index 3de7fc6..2474ffd 100644 --- a/docs/pds4_create_xml_index.rst +++ b/docs/pds4_create_xml_index.rst @@ -145,9 +145,10 @@ Limiting results - ``--limit-xpaths-file XPATHS_FILEPATH``: Specify a text file containing a list of specific XPaths to extract from the label files. If this argument is not specified, all - elements found in the label files will be included. The given text file can specify - XPaths using ``glob``-style syntax, where each XPath level is treated as if it were a - directory in a filesystem. 
Available wildcards are: + elements found in the label files will be included. This command uses only the whole + versions of the XPath(s) -- simplified versions are not allowed. The given text file + can specify XPaths using ``glob``-style syntax, where each XPath level is treated as if + it were a directory in a filesystem. Available wildcards are: - ``?`` matches any single character within an XPath level - ``*`` matches any series of characters within an XPath level @@ -302,6 +303,8 @@ Below is the ``label-contents`` section of the default configuration file:: External_Reference: Source_Product_Internal: Source_Product_External: + File_Area_Ancillary: + File_Area_Metadata: Each listed value with an empty dictionary is an optional field the user can include in their generated label. If the user does decide to include one of these fields, **they must @@ -311,41 +314,41 @@ element will remain empty**. For reference, provided below are the full contents of the optional label classes:: Citation_Information: - author_list - editor_list - publication_year - doi - keyword - description + author_list: + editor_list: + publication_year: + doi: + keyword: + description: Funding_Acknowledgement: - funding_source - funding_year - funding_award - funding_acknowledgement_text + funding_source: + funding_year: + funding_award: + funding_acknowledgement_text: Modification_Detail: - modification_date - version_id - description + modification_date: + version_id: + description: Internal_Reference: - lid_reference - reference_type - comment + lid_reference: + reference_type: + comment: External_Reference: - doi - reference_text - description + doi: + reference_text: + description: Source_Product_Internal: - lidvid_reference - reference_type - comment + lidvid_reference: + reference_type: + comment: Source_Product_External: - external_source_product_identifier - reference_type - doi - curating_facility - description + external_source_product_identifier: + reference_type: + doi: + 
curating_facility: + description: File_Area_Ancillary / File_Area_Metadata: - creation_date_time + creation_date_time: If no new contents are specified for label generation, the label will contain the diff --git a/pds4indextools/index_label_template_pds.xml b/pds4indextools/index_label_template_pds.xml index 5f7aca3..8b40748 100644 --- a/pds4indextools/index_label_template_pds.xml +++ b/pds4indextools/index_label_template_pds.xml @@ -20,25 +20,35 @@ $END_IF Product_Ancillary $IF(Citation_Information) + $IF(Citation_Information['author_list'] and isinstance(Citation_Information['author_list'], list)) + $FOR(Citation_Information['author_list']) + $VALUE$ + $END_FOR + $ELSE_IF(Citation_Information['author_list'] and not isinstance(Citation_Information['author_list'], list)) $Citation_Information['author_list']$ + $END_IF $Citation_Information['editor_list']$ $Citation_Information['publication_year']$ $Citation_Information['doi']$ + $IF(Citation_Information['keyword'] and isinstance(Citation_Information['keyword'], list)) $FOR(Citation_Information['keyword']) $VALUE$ $END_FOR + $ELSE_IF(Citation_Information['keyword'] and not isinstance(Citation_Information['keyword'], list)) + $Citation_Information['keyword']$ + $END_IF $Citation_Information['description']$ - $IF(Citation_Information.get('Funding_Acknowledgement')) + $IF(Citation_Information['Funding_Acknowledgement']) - $Funding_Acknowledgement['funding_source']$ - $Funding_Acknowledgement['funding_year']$ - $Funding_Acknowledgement['funding_award']$ - $Funding_Acknowledgement['funding_acknowledgement_text']$ + $Citation_Information['Funding_Acknowledgement']['funding_source']$ + $Citation_Information['Funding_Acknowledgement']['funding_year']$ + $Citation_Information['Funding_Acknowledgement']['funding_award']$ + $Citation_Information['Funding_Acknowledgement']['funding_acknowledgement_text']$ $END_IF $END_IF - $IF(Modification_Detail) + $IF(Modification_Detail and isinstance(Modification_Detail, list)) $FOR(field, 
k=Modification_Detail) @@ -48,6 +58,14 @@ $END_IF $END_FOR + $ELSE_IF(Modification_Detail) + + + $Modification_Detail['modification_date']$ + $Modification_Detail['version_id']$ + $Modification_Detail['description']$ + + $END_IF Creative Common Public License CC0 1.0 (2024) @@ -58,6 +76,7 @@ $END_IF + $IF(Internal_Reference or External_Reference or Source_Product_Internal or Source_Product_External) $IF(Internal_Reference) $FOR(field, k=Internal_Reference) @@ -98,6 +117,7 @@ $END_IF $END_FOR $END_IF + $END_IF $IF(Product_Ancillary) $ELSE diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index 9bf6439..c7d72da 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -11,11 +11,11 @@ import argparse from collections import namedtuple +from collections import Counter import csv from datetime import datetime import fnmatch import functools -import itertools from itertools import groupby from lxml import etree import os @@ -68,82 +68,53 @@ def convert_header_to_xpath(root, xml_header_path, namespaces): 'pds:Product_Observational/pds:Identification_Area[1]/pds:version_id[2]' """ sections = xml_header_path.split('/') + prefixes = namespaces.keys() xpath_final = '' portion = '' for sec in sections[1:]: + # portion = portion + section portion = f'{portion}/{sec}' + # grab the tag of that portion. 
tag = str(root.xpath(portion, namespaces=namespaces)[0].tag) + # if the section starts with '*', it's everything after the '*' if sec.startswith('*'): sec = sec[1:] - if ':' in sec: - sec = '' + # if sec starts with :, make it blank + elif any(f'{prefix}:' in sec for prefix in prefixes): + predicate = sec.split('[')[-1] + if predicate[0].isdigit(): + sec = f'[{sec.split('[')[-1]}' + else: + sec = '' + # xpath_final is the current path, then the tag, then section/ xpath_final = f'{xpath_final}/{tag}{sec}' return xpath_final -def correct_duplicates(label_results): +def clean_headers(df): """ - Correct numbering of XPaths to have correct predicates. - - Some namespaces do not contain predicates, and as a result, must be made artificially - unique via injected substrings. This function aids in the reformatting of these - strings so they match the syntax of the renumbering function. Note that this function - does not affect elements or attributes that natively contain the '_num' substring - (e.g., cassini:filter_name_1 and cassini:filter_name_2). + Clean the headers of a DataFrame by replacing certain characters with safer + alternatives and return a mapping of new to old headers. Parameters: - label_results (dict): The dictionary of XML results. This argument will be - mutated by the function. + df (pandas.DataFrame): The DataFrame whose headers need to be cleaned. 
- Example: - # XPaths in label_results shortened for readability - >>> keys = list(label_result) - >>> keys = [ - ../geom:SPICE_Kernel_Identification<1>/geom:kernel_type<1>, - ../geom:SPICE_Kernel_Identification<1>/geom:kernel_type_1<1>, - ../geom:SPICE_Kernel_Identification<1>/geom:kernel_type_2<1>, - ../geom:SPICE_Kernel_Identification<1>/geom:kernel_type_3<1>, - ../geom:SPICE_Kernel_Identification<1>/geom:kernel_type_4<1> - ] - >>> correct_duplicate(label_results) - >>> keys = list(label_result) - >>> keys = [ - ../geom:SPICE_Kernel_Identification<1>/geom:kernel_type<1>, - ../geom:SPICE_Kernel_Identification<2>/geom:kernel_type<1>, - ../geom:SPICE_Kernel_Identification<3>/geom:kernel_type<1>, - ../geom:SPICE_Kernel_Identification<4>/geom:kernel_type<1>, - ../geom:SPICE_Kernel_Identification<5>/geom:kernel_type<1> - ] + Returns: + dict: A dictionary mapping new headers to old headers. """ - element_names = set() - for key in list(label_results): - tag = key.split('/')[-1].split('<')[0] - number = tag.split('_')[-1] - if number.isdigit(): - cropped = tag.replace(f'_{number}', '') - if cropped in element_names: - if str(f'{cropped}_{number}<1>') in key: - key_new = key.replace((f'{cropped}_{number}<1>'), f'{cropped}<1>') - else: - key_new = key.replace(f'{cropped}_{number}', f'{cropped}<1>') - parent = key_new.split('/')[-2].split('<')[0] - key_new = key_new.replace(f'{parent}<1>', - f'{parent}<{str(int(number)+1)}>') - label_results[key_new] = label_results.pop(key) - element_names.add(tag) + # Create a mapping of old to new headers + header_map = {col: col.replace(':', '_') + .replace('/', '__') + .replace('<', '_') + .replace('>', '') for col in df.columns} + # Update the DataFrame's headers + df.rename(columns=header_map, inplace=True) -def clean_headers(df): - """ - Clean the headers of a DataFrame by replacing certain characters with safer - alternatives. 
+ header_map = {v: k for k, v in list(header_map.items())} - Parameters: - df (pandas.DataFrame): The DataFrame whose headers need to be cleaned. - """ - return df.rename(columns=lambda x: x.replace( - ':', '_').replace('/', '__').replace('<', '_').replace('>', ''), inplace=True) + return header_map def default_value_for_nil(config, data_type, nil_value): @@ -429,21 +400,11 @@ def process_headers(label_results, key, root, namespaces, prefixes): prefixes (dict): A dictionary containing XML namespace prefixes. """ key_new = convert_header_to_xpath(root, key, namespaces) - # Replace namespaces with prefixes for namespace in prefixes: if namespace in key_new: key_new = key_new.replace('{' + namespace + '}', prefixes[namespace] + ':') - # Check if key_new already exists in label_results, append suffix if necessary - if key_new in label_results: - suffix_gen = itertools.count(start=1, step=1) - while True: - trial_key = f"{key_new}_{next(suffix_gen)}" - if trial_key not in label_results: - key_new = trial_key - break - label_results[key_new] = label_results.pop(key) @@ -785,6 +746,14 @@ def pad_column_values_and_headers(df): df = pd.DataFrame(rows) + if args.simplify_xpaths: + original_headers = df.columns.tolist() + simplified_headers = simplify_xpaths(original_headers) + df.columns = simplified_headers + + if args.clean_header_field_names: + clean_header_mapping = clean_headers(df) + if args.sort_by: sort_values = str(args.sort_by).split(',') try: @@ -793,9 +762,6 @@ def pad_column_values_and_headers(df): print(bad_sort) sys.exit(1) - if args.clean_header_field_names: - clean_headers(df) - if args.fixed_width: padded_df = pad_column_values_and_headers(df) print(f'Fixed-width index file generated at {output_csv_path}') @@ -805,6 +771,11 @@ def pad_column_values_and_headers(df): print(f'Index file generated at {output_csv_path}') df.to_csv(output_csv_path, index=False, na_rep='', lineterminator='\n') + if args.clean_header_field_names: + return clean_header_mapping + 
else: + return None + def find_base_attribute(xsd_tree, target_name, new_namespaces): """ @@ -1130,6 +1101,48 @@ def generate_unique_filename(base_name): return new_filename +def simplify_xpaths(headers): + """ + Simplifies a list of XPath headers by shortening each header to its tag and + namespace prefix, provided the tag is unique. + + This function processes a list of XPath-like strings (headers) and attempts to + simplify them to their last tag component. If a tag is unique within the list, + it replaces the full XPath header with the tag. If the tag is not unique + (i.e., multiple headers share the same tag), the full XPath header is retained. + + Args: + headers (list of str): A list of strings representing XPath headers. + + Returns: + list of str: A list of strings where unique tags have replaced their + corresponding full XPath headers, and non-unique tags remain unchanged. + """ + # If --simplify-xpaths is used, the XPath headers will be shortened to the + # element's tag and namespace prefix. This is contingent on the uniqueness of + # the XPath header; if more than one XPath header shares a tag, a namespace and a + # predicate value, the XPath header will remain whole. + tags = [] + matches = {} + + # Step 1: Gather all possible tags from labels + for header in headers: + tag = header.split('/')[-1] + tags.append(tag) + matches[header] = tag + + term_counts = Counter(tags) + + for ind, header in enumerate(headers): + tag = header.split('/')[-1] + if term_counts[tag] == 1: + headers[ind] = tag + else: + continue + + return headers + + class MultilineFormatter(argparse.HelpFormatter): """Class to allow multi-line help messages with argparse. @@ -1218,7 +1231,8 @@ def main(cmd_line=None): metavar='XPATHS_FILEPATH', help='Optional text file specifying which XPaths to ' 'scrape. If not specified, all XPaths found in ' - 'the label files are included.') + 'the label files are included. 
Only whole XPaths ' + 'can be specified.') limiting_results.add_argument('--output-headers-file', type=str, metavar='HEADERS_FILEPATH', @@ -1267,7 +1281,6 @@ def main(cmd_line=None): nillable_elements_info = {} collected_files = set() all_results = [] - tags = [] xsd_files = [] output_csv_path = None @@ -1352,6 +1365,7 @@ def main(cmd_line=None): # improve readability. Each XPath's namespace is replaced with its prefix for # faster reference. Duplicate XPaths are made unique to ensure all results are # present in the final product. + for key in list(label_results): process_headers(label_results, key, root, namespaces, prefixes) @@ -1375,7 +1389,6 @@ def main(cmd_line=None): for key in list(label_results): if 'cyfunction' in key: del label_results[key] - # The XPath headers must be renumbered to reflect which instance of the element # the column refers to. At this stage, duplicate XPaths may exist again due to # the reformatting. These duplicates are corrected to preserve the contents of @@ -1384,8 +1397,6 @@ def main(cmd_line=None): for old_xpath, new_xpath in xpath_map.items(): label_results[new_xpath] = label_results.pop(old_xpath) - correct_duplicates(label_results) - # Collect metadata about the label file. The label file's lid is scraped and # broken into multiple parts. This metadata can then be requested as additional # columns within the index file. @@ -1426,57 +1437,14 @@ def main(cmd_line=None): print('No results found: glob pattern(s) excluded all matches.') sys.exit(1) - # If --simplify-xpaths is used, the XPath headers will be shortened to the - # element's tag and namespace prefix. This is contingent on the uniqueness of - # the XPath header; if more than one XPath header shares a tag, a namespace and a - # predicate value, the XPath header will remain whole. 
if args.simplify_xpaths: - headers = {} - unique_tags_master = [] - - # Step 1: Gather all possible tags from labels + original_headers = {} for label_results in all_results: - keys = label_results.keys() - for key in keys: - tag = key.split('/')[-1] - tags.append(tag) - if key not in headers: - headers[key] = tag - - # For each label, collect all tags that only occur once. If a unique tag occurs - # multiple times within a label, that tag will be removed from the collective - # list of unique tags. - for label_results in all_results: - tags = [] - unique_tags = [] - names = [] - for key in keys: - tag = key.split('/')[-1] - tags.append(tag) - name = tag.split('<')[0] - names.append(name) - for tag in tags: - name = tag.split('<')[0] - if (tags.count(tag) == 1 and names.count(name) == 1 - and tag not in unique_tags): - unique_tags.append(tag) - - for tag in unique_tags: - unique_tags_master.append(tag) - - for ind, label_results in enumerate(all_results): - new_label_results = {} - for key, value in list(label_results.items()): - new_key = headers[key] - if key.split('/')[-1] in unique_tags_master: - new_label_results[new_key] = value - else: - new_label_results[key] = value - - all_results[ind] = new_label_results + for key in label_results.keys(): + original_headers[key] = key.split('/')[-1] if output_csv_path: - write_results_to_csv(all_results, args, output_csv_path) + clean_header_mapping = write_results_to_csv(all_results, args, output_csv_path) # To instead receive a list of available information available within a label or set # of labels, you may use --output-headers-file. This will take all of the keys of @@ -1495,6 +1463,8 @@ def main(cmd_line=None): # The file is now written and placed in a given location. If cleaned header # field names are requested, they are processed here before being written in. 
with open(output_txt_path, 'w') as output_fp: + if args.simplify_xpaths: + xpaths = simplify_xpaths(xpaths) for item in xpaths: if args.clean_header_field_names: verboseprint( @@ -1547,8 +1517,12 @@ def main(cmd_line=None): # file is fixed-width or delimited. for header in headers: whole_header = header + whole_header_length = len(whole_header) if args.fixed_width: header = header.strip() + if args.clean_header_field_names: + full_header = header + header = clean_header_mapping[header] if (header in valid_add_extra_file_info and 'lid' in header): true_type = 'pds:ASCII_LID' elif header == 'filename': @@ -1565,8 +1539,13 @@ def main(cmd_line=None): true_type = true_type.split(':')[-1] field_number += 1 - header_length = len(header.encode('utf-8')) - header_name = header + + if args.clean_header_field_names: + header_length = len(full_header.encode('utf-8')) + header_name = full_header + else: + header_length = len(header.encode('utf-8')) + header_name = header maximum_field_length = maximum_field_lengths[whole_header] header_info.append({'name': header_name, @@ -1576,7 +1555,10 @@ def main(cmd_line=None): 'field_length': maximum_field_length, 'maximum_field_length': maximum_field_length, 'offset': offset}) - offset += header_length + jump + if args.fixed_width: + offset += whole_header_length + jump + else: + offset += header_length + jump field_location = offset # The creation date of the index file is stored for later reference. 
diff --git a/test_files/expected/cleaned_headers_label_success.csv b/test_files/expected/cleaned_headers_label_success.csv new file mode 100644 index 0000000..efb79b3 --- /dev/null +++ b/test_files/expected/cleaned_headers_label_success.csv @@ -0,0 +1,3 @@ +pds_logical_identifier_1,pds_version_id_1,pds_title_1,pds_information_model_version_1,pds_author_list_1,pds_publication_year_1,pds_keyword_1,pds_keyword_2,pds_keyword_3,pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_1__geom_SPICE_Kernel_Identification_1__geom_kernel_type_1,pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_1__geom_SPICE_Kernel_Identification_1__geom_spice_kernel_file_name_1,pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_1__geom_comment_1,pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_2__geom_SPICE_Kernel_Identification_1__geom_kernel_type_1,pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_2__geom_SPICE_Kernel_Identification_1__geom_spice_kernel_file_name_1,pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_2__geom_comment_1,pds_Product_Observational__pds_Observing_System_1__pds_name_1,pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_name_1,pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_type_1,pds_Product_Observational__pds_Observing_System_2__pds_name_1,pds_Product_Observational__pds_Observing_System_2__pds_Observing_System_Component_1__pds_name_1,pds_Product_Observational__pds_Observing_System_2__pds_Observing_System_Component_1__pds_type_1,pds_Product_Observational__pds_Observing_System_2__pds_Observing_System_Component_1__
pds_Internal_Reference_1__pds_lid_reference_1,pds_Product_Observational__pds_Observing_System_2__pds_Observing_System_Component_1__pds_Internal_Reference_1__pds_reference_type_1,pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_Internal_Reference_1__pds_lid_reference_1,pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_Internal_Reference_1__pds_reference_type_1 +urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n,1.0,Cassini ISS Image 1455200455n.img,1.11.0.0,"French, Richard G.",0003-01-01,kw1,kw2,kw3,SPK,ura111.bsp,These kernel files were used in the generation of the products in the parent bundle. Some or all of them may not have been used directly in the generation of this product.,SPK,earthstns_itrf93_040916.bsp,These kernel files were used in the generation of the products in the parent bundle. Some or all of them may not have been used directly in the generation of this product.,Cassini Orbiter Imaging Science Subsystem,Cassini Orbiter,Spacecraft,Another thing,Another thing,Spacecraft,urn:nasa:pds:context:instrument_host:spacecraft.co,is_instrument_host,, +urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n,1.0,Cassini ISS Image 1455200455n.img,1.11.0.0,"French, Richard G.",0003-01-01,kw1,,,SPK,ura111.bsp,These kernel files were used in the generation of the products in the parent bundle. 
Some or all of them may not have been used directly in the generation of this product.,,,,Cassini Orbiter Imaging Science Subsystem,Cassini Orbiter,Spacecraft,,,,,,urn:nasa:pds:context:instrument_host:spacecraft.co,is_instrument_host diff --git a/test_files/expected/cleaned_headers_label_success.xml b/test_files/expected/cleaned_headers_label_success.xml new file mode 100644 index 0000000..621f7a2 --- /dev/null +++ b/test_files/expected/cleaned_headers_label_success.xml @@ -0,0 +1,226 @@ + + + + + + urn:nasa:pds:rms_metadata:document_opus:cleaned_headers_label + 1.1 + Index File + 1.21.0.0 + Product_Ancillary + + Creative Common Public License CC0 1.0 (2024) + Creative Commons Zero (CC0) license information. + + urn:nasa:pds:system_bundle:document_pds4_standards:creative_commons_1.0.0::1.0 + product_to_license + + + + + + cleaned_headers_label.csv + index-table + 0002-02-02T00:00:00.00Z + 24837ed11b0e8ceb94102e1f22d95b31 + + +
+ 0 + 2183 + UTF-8 Text + Provides the column headers, separated by commas, for the data table. +
+ + 0 + 3370 + PDS DSV 1 + 3 + Line-Feed + Comma + + 25 + 0 + 2182 + + pds_logical_identifier_1 + 1 + ASCII_LID + 52 + + + + pds_version_id_1 + 2 + ASCII_Short_String_Collapsed + 3 + + + + pds_title_1 + 3 + ASCII_Short_String_Collapsed + 33 + + + + pds_information_model_version_1 + 4 + ASCII_Short_String_Collapsed + 8 + + + + pds_author_list_1 + 5 + UTF8_Text_Preserved + 18 + + + + pds_publication_year_1 + 6 + ASCII_Date_YMD + 10 + + + + pds_keyword_1 + 7 + UTF8_Short_String_Collapsed + 3 + + + + pds_keyword_2 + 8 + UTF8_Short_String_Collapsed + 3 + + + + pds_keyword_3 + 9 + UTF8_Short_String_Collapsed + 3 + + + + pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_1__geom_SPICE_Kernel_Identification_1__geom_kernel_type_1 + 10 + ASCII_Short_String_Collapsed + 3 + + + + pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_1__geom_SPICE_Kernel_Identification_1__geom_spice_kernel_file_name_1 + 11 + ASCII_File_Name + 10 + + + + pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_1__geom_comment_1 + 12 + ASCII_Text_Preserved + 171 + + + + pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_2__geom_SPICE_Kernel_Identification_1__geom_kernel_type_1 + 13 + ASCII_Short_String_Collapsed + 3 + + + + pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_2__geom_SPICE_Kernel_Identification_1__geom_spice_kernel_file_name_1 + 14 + ASCII_File_Name + 27 + + + + pds_Product_Observational__pds_Observation_Area_1__pds_Discipline_Area_1__geom_Geometry_1__geom_SPICE_Kernel_Files_2__geom_comment_1 + 15 + ASCII_Text_Preserved + 171 + + + + pds_Product_Observational__pds_Observing_System_1__pds_name_1 + 16 + UTF8_Short_String_Collapsed + 41 + + + + 
pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_name_1 + 17 + UTF8_Short_String_Collapsed + 15 + + + + pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_type_1 + 18 + ASCII_Short_String_Collapsed + 10 + + + + pds_Product_Observational__pds_Observing_System_2__pds_name_1 + 19 + UTF8_Short_String_Collapsed + 13 + + + + pds_Product_Observational__pds_Observing_System_2__pds_Observing_System_Component_1__pds_name_1 + 20 + UTF8_Short_String_Collapsed + 13 + + + + pds_Product_Observational__pds_Observing_System_2__pds_Observing_System_Component_1__pds_type_1 + 21 + ASCII_Short_String_Collapsed + 10 + + + + pds_Product_Observational__pds_Observing_System_2__pds_Observing_System_Component_1__pds_Internal_Reference_1__pds_lid_reference_1 + 22 + ASCII_LID + 50 + + + + pds_Product_Observational__pds_Observing_System_2__pds_Observing_System_Component_1__pds_Internal_Reference_1__pds_reference_type_1 + 23 + ASCII_Short_String_Collapsed + 18 + + + + pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_Internal_Reference_1__pds_lid_reference_1 + 24 + ASCII_LID + 50 + + + + pds_Product_Observational__pds_Observing_System_1__pds_Observing_System_Component_1__pds_Internal_Reference_1__pds_reference_type_1 + 25 + ASCII_Short_String_Collapsed + 18 + + + + +
+
diff --git a/test_files/expected/label_success_1.xml b/test_files/expected/label_success_1.xml index aade34b..8d53f30 100644 --- a/test_files/expected/label_success_1.xml +++ b/test_files/expected/label_success_1.xml @@ -20,8 +20,6 @@ - - generated_label_1.csv diff --git a/test_files/expected/label_success_2.xml b/test_files/expected/label_success_2.xml index 23a5758..ea0070f 100644 --- a/test_files/expected/label_success_2.xml +++ b/test_files/expected/label_success_2.xml @@ -20,8 +20,6 @@ - - generated_label_2.csv diff --git a/test_files/expected/label_success_3.xml b/test_files/expected/label_success_3.xml index 2e6127e..6bb4f39 100644 --- a/test_files/expected/label_success_3.xml +++ b/test_files/expected/label_success_3.xml @@ -20,8 +20,6 @@ - - generated_label_3.csv diff --git a/test_files/expected/nested_label_success.txt b/test_files/expected/nested_label_success.txt new file mode 100644 index 0000000..b0f1733 --- /dev/null +++ b/test_files/expected/nested_label_success.txt @@ -0,0 +1,25 @@ +pds:logical_identifier<1> +pds:version_id<1> +pds:title<1> +pds:information_model_version<1> +pds:author_list<1> +pds:publication_year<1> +pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:SPICE_Kernel_Identification_Extra<1>/geom:kernel_type<1> +pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:SPICE_Kernel_Identification_Extra<1>/geom:spice_kernel_file_name<1> +pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:SPICE_Kernel_Identification_Extra<2>/geom:kernel_type<1> 
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:SPICE_Kernel_Identification_Extra<2>/geom:spice_kernel_file_name<1> +pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:SPICE_Kernel_Identification_Extra<3>/geom:kernel_type<1> +pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:SPICE_Kernel_Identification_Extra<3>/geom:spice_kernel_file_name<1> +pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:SPICE_Kernel_Identification_Extra<4>/geom:kernel_type<1> +pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:SPICE_Kernel_Identification_Extra<4>/geom:spice_kernel_file_name<1> +pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:kernel_type<1> +pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1> +geom:comment<1> +pds:Product_Observational/pds:Observing_System<1>/pds:name<1> +pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:name<1> +pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:type<1> +pds:Product_Observational/pds:Observing_System<2>/pds:name<1> +pds:Product_Observational/pds:Observing_System<2>/pds:Observing_System_Component<1>/pds:name<1> 
+pds:Product_Observational/pds:Observing_System<2>/pds:Observing_System_Component<1>/pds:type<1> +pds:lid_reference<1> +pds:reference_type<1> diff --git a/test_files/labels/nested_label.xml b/test_files/labels/nested_label.xml new file mode 100644 index 0000000..d6127be --- /dev/null +++ b/test_files/labels/nested_label.xml @@ -0,0 +1,74 @@ + + + + + + + urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n + 1.0 + Cassini ISS Image 1455200455n.img + 1.11.0.0 + + French, Richard G. + + + + + + + + + + SPK + ura111.bsp + + + SPK + vgr2.ura111.bsp + + + BPC + earth_720101_031229.bpc + + + LSK + naif0012.tls + + + + SPK + earthstns_itrf93_040916.bsp + + These kernel files were used in the generation of the products in the parent bundle. Some or all of them may not have been used directly in the generation of this product. + + + + + + Cassini Orbiter Imaging Science Subsystem + + Cassini Orbiter + Spacecraft + + + + Another thing + + Another thing + Spacecraft + + urn:nasa:pds:context:instrument_host:spacecraft.co + is_instrument_host + + + + \ No newline at end of file diff --git a/test_files/labels/rf-tester-label_1.xml b/test_files/labels/rf-tester-label_1.xml new file mode 100644 index 0000000..3f127bc --- /dev/null +++ b/test_files/labels/rf-tester-label_1.xml @@ -0,0 +1,66 @@ + + + + + + + urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n + 1.0 + Cassini ISS Image 1455200455n.img + 1.11.0.0 + + French, Richard G. + + kw1 + kw2 + kw3 + + + + + + + + SPK + ura111.bsp + + These kernel files were used in the generation of the products in the parent bundle. Some or all of them may not have been used directly in the generation of this product. + + + + SPK + earthstns_itrf93_040916.bsp + + These kernel files were used in the generation of the products in the parent bundle. Some or all of them may not have been used directly in the generation of this product. 
+ + + + + + Cassini Orbiter Imaging Science Subsystem + + Cassini Orbiter + Spacecraft + + + + Another thing + + Another thing + Spacecraft + + urn:nasa:pds:context:instrument_host:spacecraft.co + is_instrument_host + + + + \ No newline at end of file diff --git a/test_files/labels/rf-tester-label_2.xml b/test_files/labels/rf-tester-label_2.xml new file mode 100644 index 0000000..8d62bbc --- /dev/null +++ b/test_files/labels/rf-tester-label_2.xml @@ -0,0 +1,50 @@ + + + + + + + urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n + 1.0 + Cassini ISS Image 1455200455n.img + 1.11.0.0 + + French, Richard G. + + kw1 + + + + + + + + SPK + ura111.bsp + + These kernel files were used in the generation of the products in the parent bundle. Some or all of them may not have been used directly in the generation of this product. + + + + + + Cassini Orbiter Imaging Science Subsystem + + Cassini Orbiter + Spacecraft + + urn:nasa:pds:context:instrument_host:spacecraft.co + is_instrument_host + + + + \ No newline at end of file diff --git a/test_files/samples/tester_config_label.yaml b/test_files/samples/tester_config_label.yaml index ada75dc..64b0d33 100644 --- a/test_files/samples/tester_config_label.yaml +++ b/test_files/samples/tester_config_label.yaml @@ -1,13 +1,23 @@ label-contents: title: Index file for my occultation bundle + Citation_Information: + author_list: + Emilie Simpson, + Robert French, + Mia Mace + editor_list: + publication_year: 2024 + doi: + keyword: [stellar, uranus, rings] + description: + Funding_Acknowledgement: Modification_Detail: - - modification_date: '2024-01-01' - version_id: 1.1 - description: | - This is a lengthy description of what this modification - changed in the bundle. - There were lots of changes. - - modification_date: '2023-01-01' - version_id: 1.0 - description: Initial release. + - modification_date: '2024-01-01' + version_id: 1.1 + description: This is a lengthy description of what this modification + changed in the bundle. 
+ There were lots of changes. + - modification_date: '2023-01-01' + version_id: '1.0' + description: Initial release. diff --git a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index 0c0b395..84ab987 100644 --- a/tests/test_pds4_create_xml_index_blackbox.py +++ b/tests/test_pds4_create_xml_index_blackbox.py @@ -418,6 +418,19 @@ def compare_files(path_to_file, golden_file): ] ), + # Executable command: python pds4indextools/pds4_create_xml_index.py ../test_files/labels "nested_label.xml" --output-headers-file headers_nested.txt --simplify-xpaths + # Compare result to golden copy: + # test_files/expected/nested_label_success.txt + ( + str(EXPECTED_DIR / 'nested_label_success.txt'), + None, 'nested_label.txt', + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/nested_label.xml', + '--simplify-xpaths', + ] + ), + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_1.xml" --generate-label ancillary --config-file ../test_files/samples/tester_config.yaml --output-index-file generated_label_1.csv # Compare result to golden copy: # test_files/expected/label_success_1.csv @@ -476,7 +489,26 @@ def compare_files(path_to_file, golden_file): '--config-file', str(SAMPLES_DIR / 'tester_config.yaml') ] - ) + ), + + # Executable command: pds4_create_xml_index ../test_files/labels "rf-tester-label_*.xml" --generate-label metadata --config-file ../test_files/samples/tester_config.yaml --output-index-file cleaned_headers_label.csv --clean-header-field-names + # Compare result to golden copy: + # test_files/expected/cleaned_headers_label_success.csv + # test_files/expected/cleaned_headers_label_success.xml + ( + str(EXPECTED_DIR / 'cleaned_headers_label_success.csv'), + 'cleaned_headers_label.csv', None, + [ + str(TEST_FILES_DIR), + LABEL_NAME + '/rf-tester-label_*.xml', + '--generate-label', + 'metadata', + '--config-file', + str(SAMPLES_DIR / 'tester_config.yaml'), + '--clean-header-field-names', + '--simplify-xpaths' + 
] + ), ] ) def test_success(golden_file, new_file_index, new_file_headers, cmd_line): diff --git a/tests/test_pds4_create_xml_index_whitebox.py b/tests/test_pds4_create_xml_index_whitebox.py index 62a8c83..814f120 100644 --- a/tests/test_pds4_create_xml_index_whitebox.py +++ b/tests/test_pds4_create_xml_index_whitebox.py @@ -286,28 +286,6 @@ def test_get_creation_date(create_temp_file, platform_name): assert datetime.fromisoformat(creation_date) -def test_correct_duplicates(): - label_results = { - '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name<1>': 1, - '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_1<1>': 2, - '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_2<1>': 3, - '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_3<1>': 4, - '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_4<1>': 5, - '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name_5': 6 - } - - tools.correct_duplicates(label_results) - - assert label_results == { - '../geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name<1>': 1, - '../geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>': 2, - '../geom:SPICE_Kernel_Identification<3>/geom:spice_kernel_file_name<1>': 3, - '../geom:SPICE_Kernel_Identification<4>/geom:spice_kernel_file_name<1>': 4, - '../geom:SPICE_Kernel_Identification<5>/geom:spice_kernel_file_name<1>': 5, - '../geom:SPICE_Kernel_Identification<6>/geom:spice_kernel_file_name<1>': 6 - } - - def test_update_nillable_elements_from_xsd_file(): xsd_files = [] nillable_elements_info = {} From d4411fd95180c5e5c68c5a83c1087cb95eb9e1ca Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Thu, 17 Oct 2024 15:24:16 -0700 Subject: [PATCH 20/24] Fixing f-string format --- pds4indextools/pds4_create_xml_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py 
index c7d72da..f9078ac 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -83,10 +83,10 @@ def convert_header_to_xpath(root, xml_header_path, namespaces): elif any(f'{prefix}:' in sec for prefix in prefixes): predicate = sec.split('[')[-1] if predicate[0].isdigit(): - sec = f'[{sec.split('[')[-1]}' + sec = f"[{sec.split('[')[-1]}" else: sec = '' - # xpath_final is the current path, then the tag, then section/ + # xpath_final is the current path, then the tag, then section xpath_final = f'{xpath_final}/{tag}{sec}' return xpath_final From 7ca45a66f432f7854f2778d9dcd1fa1f2bfd20aa Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Wed, 13 Nov 2024 15:32:36 -0800 Subject: [PATCH 21/24] Making changes according to pull request --- pds4indextools/index_label_template_pds.xml | 4 +- pds4indextools/pds4_create_xml_index.py | 68 +++++++++++++------ test_files/expected/label_success_2.xml | 16 ++--- ...ster-label_1.xml => rf_tester_label_1.xml} | 0 ...ster-label_2.xml => rf_tester_label_2.xml} | 0 tests/test_pds4_create_xml_index_blackbox.py | 28 +++++--- 6 files changed, 75 insertions(+), 41 deletions(-) rename test_files/labels/{rf-tester-label_1.xml => rf_tester_label_1.xml} (100%) rename test_files/labels/{rf-tester-label_2.xml => rf_tester_label_2.xml} (100%) diff --git a/pds4indextools/index_label_template_pds.xml b/pds4indextools/index_label_template_pds.xml index 8b40748..bc9142a 100644 --- a/pds4indextools/index_label_template_pds.xml +++ b/pds4indextools/index_label_template_pds.xml @@ -38,13 +38,15 @@ $END_IF $Citation_Information['keyword']$ $END_IF $Citation_Information['description']$ - $IF(Citation_Information['Funding_Acknowledgement']) + $IF('Funding_Acknowledgement' in Citation_Information) + $IF(Citation_Information['Funding_Acknowledgement']) $Citation_Information['Funding_Acknowledgement']['funding_source']$ $Citation_Information['Funding_Acknowledgement']['funding_year']$ 
$Citation_Information['Funding_Acknowledgement']['funding_award']$ $Citation_Information['Funding_Acknowledgement']['funding_acknowledgement_text']$ + $END_IF $END_IF $END_IF diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index f9078ac..bd8fa00 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -104,10 +104,7 @@ def clean_headers(df): dict: A dictionary mapping new headers to old headers. """ # Create a mapping of old to new headers - header_map = {col: col.replace(':', '_') - .replace('/', '__') - .replace('<', '_') - .replace('>', '') for col in df.columns} + header_map = {col: header_cleaner(col) for col in df.columns} # Update the DataFrame's headers df.rename(columns=header_map, inplace=True) @@ -308,6 +305,24 @@ def search_type(xsd_file, tag, namespaces): return None +def header_cleaner(header): + """ + Clean a header string. + + Parameters: + header (str): The header string to be cleaned. + + Returns: + str: The cleaned header string. + """ + return ( + header.replace(':', '_') + .replace('/', '__') + .replace('<', '_') + .replace('>', '') + ) + + def load_config_file( default_config_file=Path(__file__).resolve().parent/'default_config.yaml', specified_config_files=None): @@ -746,6 +761,17 @@ def pad_column_values_and_headers(df): df = pd.DataFrame(rows) + if ( + df.map(lambda x: isinstance(x, str) and ('"' in x or "'" in x)) + .any() + .any() + and not args.fixed_width + ): + print("Warning: scraped contents of labels contains quotes. This is " + "against PDS4 data standards. 
Index file and subsequent label file will " + "not be generated.") + sys.exit(1) + if args.simplify_xpaths: original_headers = df.columns.tolist() simplified_headers = simplify_xpaths(original_headers) @@ -764,12 +790,15 @@ def pad_column_values_and_headers(df): if args.fixed_width: padded_df = pad_column_values_and_headers(df) + print(f'Fixed-width index file generated at {output_csv_path}') - padded_df.to_csv(output_csv_path, index=False, na_rep='', lineterminator='\n') + padded_df.to_csv(output_csv_path, index=False, na_rep='', lineterminator='\n', + quoting=csv.QUOTE_MINIMAL) else: print(f'Index file generated at {output_csv_path}') - df.to_csv(output_csv_path, index=False, na_rep='', lineterminator='\n') + df.to_csv(output_csv_path, index=False, na_rep='', lineterminator='\n', + quoting=csv.QUOTE_MINIMAL) if args.clean_header_field_names: return clean_header_mapping @@ -1107,21 +1136,19 @@ def simplify_xpaths(headers): namespace prefix, provided the tag is unique. This function processes a list of XPath-like strings (headers) and attempts to - simplify them to their last tag component. If a tag is unique within the list, - it replaces the full XPath header with the tag. If the tag is not unique - (i.e., multiple headers share the same tag), the full XPath header is retained. + simplify them to their last tag component. If --simplify-xpaths is used, the XPath + headers will be shortened to the element's tag and namespace prefix. This is + contingent on the uniqueness of the XPath header; if more than one XPath header + shares a tag, a namespace and a predicate value, the XPath header will remain whole. - Args: + Parameters: headers (list of str): A list of strings representing XPath headers. Returns: list of str: A list of strings where unique tags have replaced their corresponding full XPath headers, and non-unique tags remain unchanged. """ - # If --simplify-xpaths is used, the XPath headers will be shortened to the - # element's tag and namespace prefix. 
This is contingent on the uniqueness of - # the XPath header; if more than one XPath header shares a tag, a namespace and a - # predicate value, the XPath header will remain whole. + # tags = [] matches = {} @@ -1131,14 +1158,14 @@ def simplify_xpaths(headers): tags.append(tag) matches[header] = tag + # Step 2: Count the number of instances of each tag term_counts = Counter(tags) + # Step 3: If a tag occurs only once, shorten it. for ind, header in enumerate(headers): tag = header.split('/')[-1] if term_counts[tag] == 1: headers[ind] = tag - else: - continue return headers @@ -1300,13 +1327,13 @@ def main(cmd_line=None): prev_len = len(collected_files) collected_files.update(files) if len(collected_files) == prev_len: - print(f"No files found for pattern: {pattern}") + print(f'No new files found for pattern: {pattern}') verboseprint(f'{len(collected_files)} matching file(s) found') label_files = list(collected_files) label_files.sort() - if label_files == []: + if len(label_files) == 0: print(f'No files matching any patterns found in directory: {directory_path}') sys.exit(1) @@ -1469,8 +1496,7 @@ def main(cmd_line=None): if args.clean_header_field_names: verboseprint( '--clean-header-field-names active. Headers reformatted.') - item = item.replace( - ':', '_').replace('/', '__').replace('<', '_').replace('>', '') + item = header_cleaner(item) output_fp.write("%s\n" % item) print(f'XPath headers file generated at {output_txt_path}.') @@ -1559,7 +1585,7 @@ def main(cmd_line=None): offset += whole_header_length + jump else: offset += header_length + jump - field_location = offset + field_location = offset + 1 # The creation date of the index file is stored for later reference. 
creation_date = get_creation_date(index_file) diff --git a/test_files/expected/label_success_2.xml b/test_files/expected/label_success_2.xml index ea0070f..553803e 100644 --- a/test_files/expected/label_success_2.xml +++ b/test_files/expected/label_success_2.xml @@ -54,56 +54,56 @@ pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1> 2 - 79 + 80 ASCII_Short_String_Collapsed 70 pds:Product_Observational/pds:Identification_Area<1>/pds:title<1> 3 - 150 + 151 ASCII_Short_String_Collapsed 65 pds:Product_Observational/pds:Identification_Area<1>/pds:information_model_version<1> 4 - 216 + 217 ASCII_Short_String_Collapsed 85 pds:Product_Observational/pds:Observing_System<1>/pds:name<1> 5 - 302 + 303 UTF8_Short_String_Collapsed 61 pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:name<1> 6 - 364 + 365 UTF8_Short_String_Collapsed 95 pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:type<1> 7 - 460 + 461 ASCII_Short_String_Collapsed 95 pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:lid_reference<1> 8 - 556 + 557 ASCII_LID 130 pds:Product_Observational/pds:Observing_System<1>/pds:Observing_System_Component<1>/pds:Internal_Reference<1>/pds:reference_type<1> 9 - 687 + 688 ASCII_Short_String_Collapsed 131 diff --git a/test_files/labels/rf-tester-label_1.xml b/test_files/labels/rf_tester_label_1.xml similarity index 100% rename from test_files/labels/rf-tester-label_1.xml rename to test_files/labels/rf_tester_label_1.xml diff --git a/test_files/labels/rf-tester-label_2.xml b/test_files/labels/rf_tester_label_2.xml similarity index 100% rename from test_files/labels/rf-tester-label_2.xml rename to test_files/labels/rf_tester_label_2.xml diff --git a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index 84ab987..e51371c 100644 --- a/tests/test_pds4_create_xml_index_blackbox.py +++ 
b/tests/test_pds4_create_xml_index_blackbox.py @@ -500,7 +500,7 @@ def compare_files(path_to_file, golden_file): 'cleaned_headers_label.csv', None, [ str(TEST_FILES_DIR), - LABEL_NAME + '/rf-tester-label_*.xml', + LABEL_NAME + '/rf_tester_label_*.xml', '--generate-label', 'metadata', '--config-file', @@ -522,15 +522,16 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): # Call main() function with the simulated command line arguments tools.main(cmd_line) - path_to_file = ROOT_DIR / 'index.csv' + path_to_index_file = ROOT_DIR / 'index.csv' - compare_files(path_to_file, golden_file) - os.remove(path_to_file) + compare_files(path_to_index_file, golden_file) + os.remove(path_to_index_file) else: # THE PATH TO THE NEW FILE if new_file_index: path_to_file = temp_dir_path / new_file_index + path_to_label_file = ROOT_DIR / 'index.xml' cmd_line.append('--output-index-file') cmd_line.append(str(path_to_file)) # Call main() function with the simulated command line arguments @@ -544,6 +545,8 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): assert os.path.isfile(label_path) compare_files(label_path, golden_label) + if os.path.isfile(path_to_label_file): + os.remove(path_to_label_file) if new_file_headers: path_to_file = temp_dir_path / new_file_headers @@ -647,13 +650,16 @@ def test_success(golden_file, new_file_index, new_file_headers, cmd_line): ] ) def test_failures(cmd_line): - # Call main() function with the simulated command line arguments - with pytest.raises(SystemExit) as e: - tools.main(cmd_line) - assert e.type == SystemExit - assert e.value.code != 0 # Check that the exit code indicates failure - if os.path.isfile('hdout.txt'): - os.remove('hdout.txt') + try: + # Call main() function with the simulated command line arguments + with pytest.raises(SystemExit) as e: + tools.main(cmd_line) + assert e.type == SystemExit + assert e.value.code != 0 # Check that the exit code indicates failure + finally: + # Ensure 
hdout.txt is deleted regardless of test outcome + if os.path.isfile('hdout.txt'): + os.remove('hdout.txt') @pytest.mark.parametrize( From 864ee8989e954c3b85f2b54fcfaa1ded904a5b2e Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Wed, 13 Nov 2024 15:37:48 -0800 Subject: [PATCH 22/24] Using DataFrame.applymap(), not DataFrame.map() --- pds4indextools/pds4_create_xml_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index bd8fa00..a608a35 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -762,7 +762,7 @@ def pad_column_values_and_headers(df): df = pd.DataFrame(rows) if ( - df.map(lambda x: isinstance(x, str) and ('"' in x or "'" in x)) + df.applymap(lambda x: isinstance(x, str) and ('"' in x or "'" in x)) .any() .any() and not args.fixed_width From 5548c8928b6a97ebc1b2bd1000ec80810bcb7ffe Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Tue, 19 Nov 2024 13:41:12 -0800 Subject: [PATCH 23/24] unified column order for extra file info, removed Python 3.8 requirement --- .github/workflows/run-tests.yml | 2 +- pds4indextools/pds4_create_xml_index.py | 13 ++++++++++++- pyproject.toml | 3 +-- tests/test_pds4_create_xml_index_blackbox.py | 2 +- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index b6810de..7eaf9a8 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -31,7 +31,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12'] fail-fast: false steps: - name: Checkout diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index a608a35..4307496 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ 
b/pds4indextools/pds4_create_xml_index.py @@ -762,7 +762,7 @@ def pad_column_values_and_headers(df): df = pd.DataFrame(rows) if ( - df.applymap(lambda x: isinstance(x, str) and ('"' in x or "'" in x)) + df.map(lambda x: isinstance(x, str) and ('"' in x)) .any() .any() and not args.fixed_width @@ -1354,6 +1354,17 @@ def main(cmd_line=None): else: elements_to_scrape = None + if args.add_extra_file_info: + if elements_to_scrape is None: + elements_to_scrape = args.add_extra_file_info + else: + # Ensure add-extra-file-info fields appear first, respecting their order + # in the command line + elements_to_scrape = args.add_extra_file_info + [ + xpath for xpath in elements_to_scrape + if xpath not in args.add_extra_file_info + ] + # For each file in label_files, load in schema files and namespaces for reference. # Traverse the label file and scrape the desired contents. Place these contents # into a dictionary to later parse into a csv file. diff --git a/pyproject.toml b/pyproject.toml index 2ac6305..f34e16a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "rms-pds4indextools" dynamic = ["version"] description = "pds4indextools" readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" dependencies = [ "lxml", "pandas", @@ -28,7 +28,6 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Utilities", "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/tests/test_pds4_create_xml_index_blackbox.py b/tests/test_pds4_create_xml_index_blackbox.py index e51371c..c3add2d 100644 --- a/tests/test_pds4_create_xml_index_blackbox.py +++ b/tests/test_pds4_create_xml_index_blackbox.py @@ -233,7 +233,7 @@ def compare_files(path_to_file, golden_file): ), # Testing --add-extra-file-info - # Executable command: pds4_create_xml_index 
../test_files/labels "tester_label_2.xml" --limit-xpaths-file ../test_files/samples/element_1.txt --add-extra-file-info filename,filepath --output-index-file extra_file_info_1.csv + # Executable command: pds4_create_xml_index ../test_files/labels "tester_label_2.xml" --limit-xpaths-file ../test_files/samples/element_extra_file_info.txt --add-extra-file-info filename,filepath --output-index-file extra_file_info_1.csv # Compare result to golden copy: # test_files/expected/extra_file_info_success_1.csv ( From a299746420a4fc9611f33e810f2a63c7f4681fb6 Mon Sep 17 00:00:00 2001 From: Emilie Simpson Date: Mon, 25 Nov 2024 16:20:41 -0800 Subject: [PATCH 24/24] limit-xpaths-file takes priority over add-extra-file-info term order --- pds4indextools/pds4_create_xml_index.py | 58 +++++++++++++++++++------ 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py index 4307496..d12bf50 100644 --- a/pds4indextools/pds4_create_xml_index.py +++ b/pds4indextools/pds4_create_xml_index.py @@ -714,7 +714,7 @@ def update_nillable_elements_from_xsd_file(xsd_file, nillable_elements_info): nillable_elements_info[name] = 'External or built-in type' -def write_results_to_csv(results_list, args, output_csv_path): +def write_results_to_csv(results_list, new_columns, args, output_csv_path): """ Write results from a list of dictionaries to a CSV file. 
@@ -761,6 +761,17 @@ def pad_column_values_and_headers(df): df = pd.DataFrame(rows) + if new_columns is not None: + new_columns_sorted = sorted(new_columns.items(), key=lambda x: x[1][0]) + + for col_name, (index, col_values) in new_columns_sorted: + # If the column already exists, remove it temporarily + if col_name in df.columns: + df = df.drop(columns=[col_name]) + + # Insert the column at the desired index + df.insert(index, col_name, col_values) + if ( df.map(lambda x: isinstance(x, str) and ('"' in x)) .any() @@ -1309,6 +1320,7 @@ def main(cmd_line=None): collected_files = set() all_results = [] xsd_files = [] + extra_file_info_ind = {} output_csv_path = None output_txt_path = None @@ -1339,8 +1351,6 @@ def main(cmd_line=None): # Loading in additional patterns from --limit-xpaths-file, if applicable, if args.limit_xpaths_file: - verboseprint( - f'Element file {args.limit_xpaths_file} used for additional patterns.') with open(args.limit_xpaths_file, 'r') as limit_xpaths_file: elements_to_scrape = [line.strip() for line in limit_xpaths_file] verboseprint('Elements to scrape:') @@ -1354,16 +1364,14 @@ def main(cmd_line=None): else: elements_to_scrape = None - if args.add_extra_file_info: - if elements_to_scrape is None: - elements_to_scrape = args.add_extra_file_info - else: - # Ensure add-extra-file-info fields appear first, respecting their order - # in the command line - elements_to_scrape = args.add_extra_file_info + [ - xpath for xpath in elements_to_scrape - if xpath not in args.add_extra_file_info - ] + if ( + args.add_extra_file_info + and args.limit_xpaths_file + and elements_to_scrape is not None + ): + for x in elements_to_scrape: + if x in valid_add_extra_file_info: + extra_file_info_ind[x] = elements_to_scrape.index(x) # For each file in label_files, load in schema files and namespaces for reference. # Traverse the label file and scrape the desired contents. 
Place these contents @@ -1458,6 +1466,15 @@ def main(cmd_line=None): all_results.append(label_results) + for label_results in all_results: + if extra_file_info_ind != {}: + new_columns = {} + for key in extra_file_info_ind.keys(): + values = [d[key] for d in all_results] + new_columns[key] = (extra_file_info_ind[key], values) + else: + new_columns = None + if args.add_extra_file_info and elements_to_scrape is not None: elements_to_scrape = args.add_extra_file_info + elements_to_scrape @@ -1482,7 +1499,8 @@ def main(cmd_line=None): original_headers[key] = key.split('/')[-1] if output_csv_path: - clean_header_mapping = write_results_to_csv(all_results, args, output_csv_path) + clean_header_mapping = write_results_to_csv(all_results, new_columns, args, + output_csv_path) # To instead receive a list of available information available within a label or set # of labels, you may use --output-headers-file. This will take all of the keys of @@ -1498,6 +1516,18 @@ def main(cmd_line=None): if xpath not in xpaths: xpaths.append(xpath) + if new_columns is not None: + # Sort new elements by index + new_elements_sorted = sorted(new_columns.items(), key=lambda x: x[1][0]) + + # Insert new elements into xpaths + for name, (index, value) in new_elements_sorted: + # Remove the value if it exists + if name in xpaths: + xpaths.remove(name) + # Insert at the desired index + xpaths.insert(index, name) + # The file is now written and placed in a given location. If cleaned header # field names are requested, they are processed here before being written in. with open(output_txt_path, 'w') as output_fp: