Skip to content

Commit

Permalink
Fixing minor bugs, changing scrape_namespaces for timeout issue (#26)
Browse files Browse the repository at this point in the history
* Fixing bugs according to issues 16, 17, 18

* Making changes according to pull request

* Fixing metavar value for --generate-label

* Committing changes before merge

* Fixed bugs in code, added new error message for sort-by

* Changed ':' to 'inapplicable'

* Fixing the unit test to work with new scrape_namespaces function

* Making flake8 compliant

* Fixing another failing unit test

* Adding a sys.exit(1) to the --sort-by bad sort key error catch
  • Loading branch information
esimpsons3ti authored Jul 9, 2024
1 parent b19b4aa commit aa8bafa
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 31 deletions.
45 changes: 18 additions & 27 deletions pds4indextools/pds4_create_xml_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ def default_value_for_nil(config, data_type, nil_value):
default = config[data_type].getint(nil_value)
elif data_type == 'pds:ASCII_Real':
default = config[data_type].getfloat(nil_value)
elif data_type is None:
default = None
else:
default = config[data_type][nil_value]

Expand Down Expand Up @@ -553,18 +555,18 @@ def store_element_text(element, tree, results_dict, xsd_files, nillable_elements
f'has no associated text: {tag}')
true_type = None
for xsd_file in xsd_files:
namespaces = scrape_namespaces(xsd_file)
xsd_tree = download_xsd_file(xsd_file)
namespaces = scrape_namespaces(xsd_tree)
true_type = find_base_attribute(xsd_tree, tag, namespaces)
if true_type:
break # Exit the loop once true_type is found

if not true_type:
modified_tag = tag + "_WO_Units"
for xsd_file in xsd_files:
xsd_tree = download_xsd_file(xsd_file)
namespaces = scrape_namespaces(xsd_file)
true_type = find_base_attribute(xsd_tree, modified_tag)
namespaces = scrape_namespaces(xsd_tree)
true_type = find_base_attribute(xsd_tree, modified_tag,
namespaces)
if true_type:
break

Expand Down Expand Up @@ -715,7 +717,12 @@ def pad_column_values_and_headers(df):

if args.sort_by:
sort_values = str(args.sort_by).split(',')
df.sort_values(by=sort_values, inplace=True)
try:
df.sort_values(by=sort_values, inplace=True)
except KeyError as bad_sort:
print(f'Unknown sort key {bad_sort}. For a list of available sort keys, use '
f'the --output-headers-file option.')
sys.exit(1)

if args.clean_header_field_names:
clean_headers(df)
Expand Down Expand Up @@ -856,27 +863,16 @@ def get_base_type(query):
return None


def scrape_namespaces(xsd_url):
def scrape_namespaces(tree):
"""
Fetch and parse an XSD file from a given URL to extract namespace declarations.
Parameters:
xsd_url (str): The URL of the XSD file to be fetched and parsed.
xsd_tree (etree._Element): The XML schema tree.
Returns:
dict: A dictionary containing the namespace declarations found in the XSD file.
Raises:
ValueError: If the XSD file cannot be retrieved (HTTP status code is not 200).
"""
# Fetch XSD content from the URL
response = requests.get(xsd_url)
if response.status_code != 200:
# Handle error if XSD file cannot be retrieved
raise ValueError(f"Failed to fetch XSD file from URL: {xsd_url}")

# Parse the XSD content
tree = etree.fromstring(response.content)

# Extract namespace declarations
namespaces = tree.nsmap
Expand Down Expand Up @@ -1441,27 +1437,20 @@ def main(cmd_line=None):
sys.exit(1)

header_info = []
sniffer = csv.Sniffer()

# The index file is opened and read for the contents of the headers. The delimiter
# is also found for later reference.
with open(index_file, 'r', encoding='utf-8') as index_fp:
full_header = index_fp.readline()
full_header_length = len(full_header)
try:
sample_data = index_fp.read(5000)
delimiter = sniffer.sniff(sample_data).delimiter
index_fp.seek(0) # Reset file pointer to the beginning
except csv.Error:
print(f'Index file {index_file} is not a CSV or tab-separated file.')
sys.exit(1)
index_fp.seek(0) # Reset file pointer to the beginning

reader = csv.reader(index_fp, delimiter=',')
headers = next(reader)

offset = 0
field_number = 0
jump = len(delimiter)
jump = 1
field_location = 1
maximum_field_lengths = compute_max_field_lengths(index_file)

Expand Down Expand Up @@ -1501,6 +1490,8 @@ def main(cmd_line=None):
if true_type:
break

if true_type is None:
true_type = ':inapplicable'
true_type = true_type.split(':')[-1]
field_number += 1
header_length = len(header.encode('utf-8'))
Expand Down
6 changes: 2 additions & 4 deletions tests/test_pds4_create_xml_index_whitebox.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,14 +224,12 @@ def test_clean_headers():


def test_scrape_namespaces():
ns = tools.scrape_namespaces('https://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1B00.xsd')
tree = tools.download_xsd_file('https://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1B00.xsd')
ns = tools.scrape_namespaces(tree)

assert ns == {'xs': 'http://www.w3.org/2001/XMLSchema',
'pds': 'http://pds.nasa.gov/pds4/pds/v1'}

with pytest.raises(ValueError):
tools.scrape_namespaces('https://pds.nasa.gov/pds4/pds/v1/badschema.xsd')


def test_get_longest_row_length():
filename = expected_dir / 'extra_file_info_success_1.csv'
Expand Down

0 comments on commit aa8bafa

Please sign in to comment.