From d201241eed72efe2c922c7fbd15c7ba3467f4b97 Mon Sep 17 00:00:00 2001 From: adam-t-shaw Date: Fri, 11 Oct 2019 10:09:00 -0700 Subject: [PATCH 01/11] Renaming inputs --- src/mqm/mqm_tool.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/mqm/mqm_tool.py b/src/mqm/mqm_tool.py index 6dea8e5..f587947 100644 --- a/src/mqm/mqm_tool.py +++ b/src/mqm/mqm_tool.py @@ -7,6 +7,7 @@ from .utility import Utility from .geo_process import GeoProcessor import argparse +import gzip import ast @@ -95,7 +96,7 @@ def get_argument(): This function grabs all of the arguments that the program needs. Returns: - args.folderPath: an input folder path. + folder_path: an input folder path. args.maxDepth: a maximum tree depth. output_folder: a result folder. int(args.countNum): a count number for a stop condition in the first k-d tree. @@ -107,17 +108,20 @@ def get_argument(): """ # declare arguments and variables parser = argparse.ArgumentParser() - parser.add_argument('--folderPath', type=str, default='', help='path to an input folder') + parser.add_argument('--input', type=str, default='', help='path to an input folder') parser.add_argument('--maxDepth', type=str, default='10', help='max depth of a k-d tree') parser.add_argument('--countNum', type=str, default='10', help='a count value for a stop condition') parser.add_argument('--gridPercent', type=str, default='0.9', help='a grid percentage') parser.add_argument('--maxCount', type=str, default='', help='maximum count to the second k-d tree') + parser.add_argument('--output', type=str, help='path to an output folder') args = parser.parse_args() max_count = -1 path = 'histogram' geojson_path = 'geojson' - folder_path = os.path.normpath(args.folderPath) - output_folder = os.path.join(os.path.split(folder_path)[0], 'result') + folder_path = os.path.normpath(args.input) + if os.path.splitext(folder_path)[-1] == '.gz': + folder_path = gzip.decompress(folder_path) + output_folder = os.path.normpath(args.output) 
if args.maxCount: max_count = int(args.maxCount) From ad72a036a1bb169eac9444cac2d7281f551e167e Mon Sep 17 00:00:00 2001 From: adam-t-shaw Date: Fri, 11 Oct 2019 11:10:10 -0700 Subject: [PATCH 02/11] Handling updates --- src/mqm/geo_process.py | 8 ++++++-- src/mqm/mqm_tool.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/mqm/geo_process.py b/src/mqm/geo_process.py index bd7f04d..a758072 100644 --- a/src/mqm/geo_process.py +++ b/src/mqm/geo_process.py @@ -4,6 +4,7 @@ import os from area import area from .utility import Utility +import gzip import argparse @@ -217,8 +218,11 @@ def bounding_box_process(self): name_num_list.append([os.path.splitext(f)[0].split('-')[0], int(os.path.splitext(f)[0].split('-')[2])]) # open geojson files - with open(os.path.join(self.folder_path, f), encoding='utf-8') as new_f: - data = json.load(new_f) + if os.path.splitext(os.path.join(self.folder_path, f))[-1] == '.gz': + new_f = gzip.open(os.path.join(self.folder_path, f), encoding='utf-8') + else: + new_f = open(os.path.join(self.folder_path, f), encoding='utf-8') + data = json.load(new_f) # randomly generate unique integers (flag ids) end_point = start_point + len(data['features']) diff --git a/src/mqm/mqm_tool.py b/src/mqm/mqm_tool.py index f587947..e767714 100644 --- a/src/mqm/mqm_tool.py +++ b/src/mqm/mqm_tool.py @@ -8,6 +8,8 @@ from .geo_process import GeoProcessor import argparse import gzip +from pathlib import Path +from zipfile import ZipFile import ast @@ -119,8 +121,13 @@ def get_argument(): path = 'histogram' geojson_path = 'geojson' folder_path = os.path.normpath(args.input) - if os.path.splitext(folder_path)[-1] == '.gz': - folder_path = gzip.decompress(folder_path) + ext = os.path.splitext(folder_path)[1] + if ext == '.gz': + folder_path = gzip.open(folder_path, 'r') + elif ext == '.zip': + ext_path = os.path.splitext(folder_path)[0] + ZipFile(folder_path, 'r').extractall(os.path.dirname(ext_path)) + folder_path = ext_path 
output_folder = os.path.normpath(args.output) if args.maxCount: From 3c1c5f0fd7112abd846c52145517a34ef9309e58 Mon Sep 17 00:00:00 2001 From: adam-t-shaw Date: Fri, 11 Oct 2019 11:36:23 -0700 Subject: [PATCH 03/11] Handling updates and removing countrified csv --- src/mqm/mqm_tool.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/mqm/mqm_tool.py b/src/mqm/mqm_tool.py index e767714..214ebeb 100644 --- a/src/mqm/mqm_tool.py +++ b/src/mqm/mqm_tool.py @@ -121,13 +121,10 @@ def get_argument(): path = 'histogram' geojson_path = 'geojson' folder_path = os.path.normpath(args.input) - ext = os.path.splitext(folder_path)[1] - if ext == '.gz': - folder_path = gzip.open(folder_path, 'r') - elif ext == '.zip': + if os.path.splitext(folder_path)[1] == '.zip': ext_path = os.path.splitext(folder_path)[0] - ZipFile(folder_path, 'r').extractall(os.path.dirname(ext_path)) - folder_path = ext_path + ZipFile(folder_path, 'r').extractall('temp') + folder_path = 'temp' output_folder = os.path.normpath(args.output) if args.maxCount: @@ -282,10 +279,6 @@ def process_single_folder(input_folder, folder_path, maximum_level, count_num, g initial_area = geo_processor.get_initial_extend_area(out_BB) del geo_processor - util = Utility() - util.csv_writer(name_num, os.path.join(folder_path, os.path.basename(input_folder) + '.csv')) - del util - # perform the 1st k-d tree for depth_count in range(1, int(maximum_level) + 1): bb_collec, hist, _ = extend_partition(depth_count, out_BB, entire_data, 1) From 8fd03797b8877d6a030d848592480adae10ab9f6 Mon Sep 17 00:00:00 2001 From: adam-t-shaw Date: Fri, 11 Oct 2019 12:02:25 -0700 Subject: [PATCH 04/11] zip handling and missing output exception --- src/mqm/mqm_tool.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/mqm/mqm_tool.py b/src/mqm/mqm_tool.py index 214ebeb..28e88d1 100644 --- a/src/mqm/mqm_tool.py +++ b/src/mqm/mqm_tool.py @@ -123,8 +123,10 @@ def get_argument(): 
folder_path = os.path.normpath(args.input) if os.path.splitext(folder_path)[1] == '.zip': ext_path = os.path.splitext(folder_path)[0] - ZipFile(folder_path, 'r').extractall('temp') - folder_path = 'temp' + ZipFile(folder_path, 'r').extractall(ext_path) + folder_path = ext_path + if not args.output: + sys.exit('Error: Output argument is required') output_folder = os.path.normpath(args.output) if args.maxCount: @@ -368,13 +370,14 @@ def main(): folder_list.append(input_folder) # iterate through all sub-directories - for sub_folder in folder_list: - directory_creation(folder_path, os.path.join(folder_path, os.path.split(sub_folder)[1]), path, geojson_path) - - # process single sub-folder - process_single_folder(sub_folder, os.path.join(folder_path, os.path.split(sub_folder)[1]), maximum_level, - count_num, grid_percent, max_count, path, geojson_path, flag_val, summary_table, - os.path.split(sub_folder)[1]) + for sub_folder in folder_list: + if '__MACOSX' not in sub_folder: + directory_creation(folder_path, os.path.join(folder_path, os.path.split(sub_folder)[1]), path, geojson_path) + + # process single sub-folder + process_single_folder(sub_folder, os.path.join(folder_path, os.path.split(sub_folder)[1]), maximum_level, + count_num, grid_percent, max_count, path, geojson_path, flag_val, summary_table, + os.path.split(sub_folder)[1]) # write out a summary table util = Utility() From c02da962865879597d97e83256061868ca902357 Mon Sep 17 00:00:00 2001 From: adam-t-shaw Date: Fri, 11 Oct 2019 17:37:35 -0700 Subject: [PATCH 05/11] Adjusting gz handling and handling hidden files --- src/mqm/geo_process.py | 3 ++- src/mqm/mqm_tool.py | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/mqm/geo_process.py b/src/mqm/geo_process.py index a758072..5e4e785 100644 --- a/src/mqm/geo_process.py +++ b/src/mqm/geo_process.py @@ -213,7 +213,8 @@ def bounding_box_process(self): # loop through all geojson files for f in 
os.listdir(self.folder_path): # load the Geo-json file and ignore other files - if os.path.splitext(os.path.join(self.folder_path, f))[1] == '.geojson': + if (((os.path.splitext(os.path.join(self.folder_path, f))[1] == '.geojson') and not + os.path.join(self.folder_path, f).startswith('.'))): if len(os.path.splitext(f)[0].split('-')) == 3: # pull out this function name_num_list.append([os.path.splitext(f)[0].split('-')[0], int(os.path.splitext(f)[0].split('-')[2])]) diff --git a/src/mqm/mqm_tool.py b/src/mqm/mqm_tool.py index 28e88d1..21dc34f 100644 --- a/src/mqm/mqm_tool.py +++ b/src/mqm/mqm_tool.py @@ -121,10 +121,14 @@ def get_argument(): path = 'histogram' geojson_path = 'geojson' folder_path = os.path.normpath(args.input) - if os.path.splitext(folder_path)[1] == '.zip': - ext_path = os.path.splitext(folder_path)[0] - ZipFile(folder_path, 'r').extractall(ext_path) - folder_path = ext_path + + # Check input file structure to ensure only geojson files are stored in each subdirectory of the input directory + for item in Path(folder_path).glob('*/*'): + print(item.name.split('.')) + if 'geojson' not in item.name.split('.') and not item.name.startswith('.'): + sys.exit('Error: Directory must have the format Directory>Country>GeoJSONFile.geojson') + + # Require output path if not args.output: sys.exit('Error: Output argument is required') output_folder = os.path.normpath(args.output) @@ -371,13 +375,12 @@ def main(): # iterate through all sub-directories for sub_folder in folder_list: - if '__MACOSX' not in sub_folder: - directory_creation(folder_path, os.path.join(folder_path, os.path.split(sub_folder)[1]), path, geojson_path) + directory_creation(folder_path, os.path.join(folder_path, os.path.split(sub_folder)[1]), path, geojson_path) - # process single sub-folder - process_single_folder(sub_folder, os.path.join(folder_path, os.path.split(sub_folder)[1]), maximum_level, - count_num, grid_percent, max_count, path, geojson_path, flag_val, summary_table, - 
os.path.split(sub_folder)[1]) + # process single sub-folder + process_single_folder(sub_folder, os.path.join(folder_path, os.path.split(sub_folder)[1]), maximum_level, + count_num, grid_percent, max_count, path, geojson_path, flag_val, summary_table, + os.path.split(sub_folder)[1]) # write out a summary table util = Utility() From 9876642c8013a2a27e8288e817ce50897c955b53 Mon Sep 17 00:00:00 2001 From: adam-t-shaw Date: Sat, 12 Oct 2019 10:51:59 -0700 Subject: [PATCH 06/11] gz handling fixes --- src/mqm/geo_process.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/mqm/geo_process.py b/src/mqm/geo_process.py index 5e4e785..c92930c 100644 --- a/src/mqm/geo_process.py +++ b/src/mqm/geo_process.py @@ -212,15 +212,20 @@ def bounding_box_process(self): # loop through all geojson files for f in os.listdir(self.folder_path): + print(f) # load the Geo-json file and ignore other files - if (((os.path.splitext(os.path.join(self.folder_path, f))[1] == '.geojson') and not - os.path.join(self.folder_path, f).startswith('.'))): + if (('geojson' in (os.path.join(self.folder_path, f)).split('.')) and not + ((f)).startswith('.')): if len(os.path.splitext(f)[0].split('-')) == 3: # pull out this function - name_num_list.append([os.path.splitext(f)[0].split('-')[0], int(os.path.splitext(f)[0].split('-')[2])]) + if os.path.splitext(os.path.join(self.folder_path, f))[1] == '.gz': + name_num_list.append([os.path.splitext(f)[0].split('-')[0], int((os.path.splitext(f)[0].split('-')[2]).split('.')[0])]) + else: + name_num_list.append([os.path.splitext(f)[0].split('-')[0], (int(os.path.splitext(f)[0].split('-')[2]))]) + print(name_num_list) # open geojson files - if os.path.splitext(os.path.join(self.folder_path, f))[-1] == '.gz': - new_f = gzip.open(os.path.join(self.folder_path, f), encoding='utf-8') + if os.path.splitext(os.path.join(self.folder_path, f))[1] == '.gz': + new_f = gzip.open(os.path.join(self.folder_path, f)) else: new_f = 
open(os.path.join(self.folder_path, f), encoding='utf-8') data = json.load(new_f) From a2d7b6fad64b9ab23915449d13864f354b036cfb Mon Sep 17 00:00:00 2001 From: adam-t-shaw Date: Mon, 14 Oct 2019 14:37:53 -0700 Subject: [PATCH 07/11] Clean up --- src/mqm/geo_process.py | 3 +-- src/mqm/mqm_tool.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/mqm/geo_process.py b/src/mqm/geo_process.py index c92930c..d1077a8 100644 --- a/src/mqm/geo_process.py +++ b/src/mqm/geo_process.py @@ -214,8 +214,7 @@ def bounding_box_process(self): for f in os.listdir(self.folder_path): print(f) # load the Geo-json file and ignore other files - if (('geojson' in (os.path.join(self.folder_path, f)).split('.')) and not - ((f)).startswith('.')): + if ('geojson' in (os.path.join(self.folder_path, f)).split('.')) and not f.startswith('.'): if len(os.path.splitext(f)[0].split('-')) == 3: # pull out this function if os.path.splitext(os.path.join(self.folder_path, f))[1] == '.gz': name_num_list.append([os.path.splitext(f)[0].split('-')[0], int((os.path.splitext(f)[0].split('-')[2]).split('.')[0])]) diff --git a/src/mqm/mqm_tool.py b/src/mqm/mqm_tool.py index 21dc34f..ca01e00 100644 --- a/src/mqm/mqm_tool.py +++ b/src/mqm/mqm_tool.py @@ -124,7 +124,6 @@ def get_argument(): # Check input file structure to ensure only geojson files are stored in each subdirectory of the input directory for item in Path(folder_path).glob('*/*'): - print(item.name.split('.')) if 'geojson' not in item.name.split('.') and not item.name.startswith('.'): sys.exit('Error: Directory must have the format Directory>Country>GeoJSONFile.geojson') From 75687bd0a7f127c98412cc06d88c749d385060f8 Mon Sep 17 00:00:00 2001 From: adam-t-shaw Date: Mon, 14 Oct 2019 14:42:18 -0700 Subject: [PATCH 08/11] Required output path through argparse --- src/mqm/mqm_tool.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/mqm/mqm_tool.py b/src/mqm/mqm_tool.py index ca01e00..413f39d 100644 --- 
a/src/mqm/mqm_tool.py +++ b/src/mqm/mqm_tool.py @@ -115,7 +115,7 @@ def get_argument(): parser.add_argument('--countNum', type=str, default='10', help='a count value for a stop condition') parser.add_argument('--gridPercent', type=str, default='0.9', help='a grid percentage') parser.add_argument('--maxCount', type=str, default='', help='maximum count to the second k-d tree') - parser.add_argument('--output', type=str, help='path to an output folder') + parser.add_argument('--output', type=str, required=True, help='path to an output folder') args = parser.parse_args() max_count = -1 path = 'histogram' @@ -127,9 +127,6 @@ def get_argument(): if 'geojson' not in item.name.split('.') and not item.name.startswith('.'): sys.exit('Error: Directory must have the format Directory>Country>GeoJSONFile.geojson') - # Require output path - if not args.output: - sys.exit('Error: Output argument is required') output_folder = os.path.normpath(args.output) if args.maxCount: From 4462b9aecf8674c88192e9bd40a7c38ce190e9e9 Mon Sep 17 00:00:00 2001 From: adam-t-shaw Date: Thu, 12 Dec 2019 13:35:59 -0800 Subject: [PATCH 09/11] Clean up --- src/mqm/geo_process.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mqm/geo_process.py b/src/mqm/geo_process.py index d1077a8..72bd340 100644 --- a/src/mqm/geo_process.py +++ b/src/mqm/geo_process.py @@ -212,7 +212,6 @@ def bounding_box_process(self): # loop through all geojson files for f in os.listdir(self.folder_path): - print(f) # load the Geo-json file and ignore other files if ('geojson' in (os.path.join(self.folder_path, f)).split('.')) and not f.startswith('.'): if len(os.path.splitext(f)[0].split('-')) == 3: # pull out this function From d843c868aae626d8962e2d3f2e67e5d3550d9742 Mon Sep 17 00:00:00 2001 From: adam-t-shaw Date: Thu, 12 Dec 2019 17:01:12 -0800 Subject: [PATCH 10/11] Adding recursive logic to geojson search --- src/mqm/geo_process.py | 21 ++++++++++++++------- src/mqm/mqm_tool.py | 6 ------ 2 files changed, 14 
insertions(+), 13 deletions(-) diff --git a/src/mqm/geo_process.py b/src/mqm/geo_process.py index 72bd340..2a77238 100644 --- a/src/mqm/geo_process.py +++ b/src/mqm/geo_process.py @@ -210,16 +210,23 @@ def bounding_box_process(self): end_point = 0 - # loop through all geojson files - for f in os.listdir(self.folder_path): - # load the Geo-json file and ignore other files + # Loop through all geojson files in folder path recursively + for roots, dirs, files in os.walk(self.folder_path): + checks_list = [] + for f in files: + check_name = os.path.splitext(f)[0].split('-')[0] + if not check_name.startswith('.'): + checks_list.append(check_name) + if len(checks_list) != len(set(checks_list)): + sys.exit('Each subdirectory must have only one GeoJSON per Atlas Check and no more than one boundary ' + 'file') + # load the Geo-json file and ignore other files if ('geojson' in (os.path.join(self.folder_path, f)).split('.')) and not f.startswith('.'): if len(os.path.splitext(f)[0].split('-')) == 3: # pull out this function if os.path.splitext(os.path.join(self.folder_path, f))[1] == '.gz': name_num_list.append([os.path.splitext(f)[0].split('-')[0], int((os.path.splitext(f)[0].split('-')[2]).split('.')[0])]) else: name_num_list.append([os.path.splitext(f)[0].split('-')[0], (int(os.path.splitext(f)[0].split('-')[2]))]) - print(name_num_list) # open geojson files if os.path.splitext(os.path.join(self.folder_path, f))[1] == '.gz': @@ -246,7 +253,7 @@ def bounding_box_process(self): # ============================== geometry_bounding_box_list.append(geometry_bounding_box) self.output_data += tmp_geometry_collec - + else: # discard a feature without feature properties if len(data['features'][geometry_index]['properties']['feature_properties']) != 0: @@ -259,11 +266,11 @@ def bounding_box_process(self): data['features'][geometry_index]['properties']['feature_properties'][0]['identifier'], f]) geometry_bounding_box_list.append(geometry_bounding_box) - + # get a file bounding box for 
given multiple geometry bounding boxes, and add it into folder bounding box list folder_bounding_box_set.append(self.final_bounding_box_generation(geometry_bounding_box_list, 4)) del geometry_bounding_box_list - + # update start point start_point = len(data['features']) diff --git a/src/mqm/mqm_tool.py b/src/mqm/mqm_tool.py index 413f39d..05ea156 100644 --- a/src/mqm/mqm_tool.py +++ b/src/mqm/mqm_tool.py @@ -121,12 +121,6 @@ def get_argument(): path = 'histogram' geojson_path = 'geojson' folder_path = os.path.normpath(args.input) - - # Check input file structure to ensure only geojson files are stored in each subdirectory of the input directory - for item in Path(folder_path).glob('*/*'): - if 'geojson' not in item.name.split('.') and not item.name.startswith('.'): - sys.exit('Error: Directory must have the format Directory>Country>GeoJSONFile.geojson') - output_folder = os.path.normpath(args.output) if args.maxCount: From 17bad0b2bf9ffe0ffb443810642cf3c3c7d947ba Mon Sep 17 00:00:00 2001 From: adam-t-shaw Date: Sun, 12 Jan 2020 10:21:58 -0800 Subject: [PATCH 11/11] Reverting os.walk implementation and single geojson requirement, updating readme --- README.md | 9 ++++++--- src/mqm/geo_process.py | 15 +++------------ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 085ee84..27f9303 100644 --- a/README.md +++ b/README.md @@ -19,12 +19,15 @@ pipenv shell **To run the program successfully, please follow steps:**
-1. create an arbitrary directory storing all sub-directories, and put all input sub-directories into it.
+1. Create an arbitrary directory containing either: + - Countrified subdirectories, each containing the relevant atlas check geometries and optional boundary + - A set of atlas check files and optional boundary without separate subdirectories (output will not be divided into subdirectories)
-2. run the program through applying
+2. Run the program through applying:
``` -python3 -m mqm --folderPath [a absolute folder path] --maxDepth [maximum tree depth (default = 10)] +python3 -m mqm --input [input directory containing atlas check geometries and boundary files] +--output [output directory to store results] --maxDepth [maximum tree depth (default = 10)] --countNum [a count number (default = 10)] --gridPercent [a grid percentage (default = 0.9)] --maxCount [maximum count to the second k-d tree] diff --git a/src/mqm/geo_process.py b/src/mqm/geo_process.py index 2a77238..1a55e05 100644 --- a/src/mqm/geo_process.py +++ b/src/mqm/geo_process.py @@ -209,18 +209,9 @@ def bounding_box_process(self): start_point = 0 end_point = 0 - - # Loop through all geojson files in folder path recursively - for roots, dirs, files in os.walk(self.folder_path): - checks_list = [] - for f in files: - check_name = os.path.splitext(f)[0].split('-')[0] - if not check_name.startswith('.'): - checks_list.append(check_name) - if len(checks_list) != len(set(checks_list)): - sys.exit('Each subdirectory must have only one GeoJSON per Atlas Check and no more than one boundary ' - 'file') - # load the Geo-json file and ignore other files + # loop through all geojson files + for f in os.listdir(self.folder_path): + # load the Geo-json file and ignore other files if ('geojson' in (os.path.join(self.folder_path, f)).split('.')) and not f.startswith('.'): if len(os.path.splitext(f)[0].split('-')) == 3: # pull out this function if os.path.splitext(os.path.join(self.folder_path, f))[1] == '.gz':