From bdaee3e2dce1ffa1f52df3b573fd941af148e0bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=2E=20Ram=C3=ADrez?= <32576858+JMasr@users.noreply.github.com> Date: Fri, 23 Sep 2022 04:20:45 +0200 Subject: [PATCH 1/3] Update extract_data.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit refactors the **extract_data.py** using only Python's standard libraries. This is useful for Windows users and a little more Pythonic 👍 --- extract_data.py | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/extract_data.py b/extract_data.py index ffe0e530..b8ec104a 100644 --- a/extract_data.py +++ b/extract_data.py @@ -1,32 +1,45 @@ import os -import sys -import subprocess -import numpy as np +import shutil import glob -import json -import pandas as pd +import tarfile ''' This script creates a folder "Extracted_data" inside which it extracts all the wav files in the directories date-wise ''' -coswara_data_dir = os.path.abspath('.') # Local Path of iiscleap/Coswara-Data Repo -extracted_data_dir = os.path.join(coswara_data_dir, 'Extracted_data') +coswara_data_dir = os.path.abspath('.') # Local Path of iiscleap/Coswara-Data Repo +extracted_data_dir = os.path.join(coswara_data_dir, 'extracted_data') if not os.path.exists(coswara_data_dir): - raise("Check the Coswara dataset directory!") + raise ("Check the Coswara dataset directory!") if not os.path.exists(extracted_data_dir): - os.makedirs(extracted_data_dir) # Creates the Extracted_data folder if it doesn't exist + os.makedirs(extracted_data_dir) # Creates the Extracted_data folder if it doesn't exist -dirs_extracted = set(map(os.path.basename,glob.glob('{}/202*'.format(extracted_data_dir)))) -dirs_all = set(map(os.path.basename,glob.glob('{}/202*'.format(coswara_data_dir)))) +dirs_extracted = set(map(os.path.basename, glob.glob('{}/202*'.format(extracted_data_dir)))) +dirs_all = set(map(os.path.basename, 
glob.glob('{}/202*'.format(coswara_data_dir)))) dirs_to_extract = list(set(dirs_all) - dirs_extracted) +all_file_temp = os.path.join(extracted_data_dir, "temp.tar.gz") -for d in dirs_to_extract: - p = subprocess.Popen('cat {}/{}/*.tar.gz.* |tar -xvz -C {}/'.format(coswara_data_dir, d, extracted_data_dir), shell=True) - p.wait() +def extract(infile: str): + # concatenate all the *tar.gz* files + with open(all_file_temp, 'wb') as wfp: + for fn in infile: + with open(fn, 'rb') as rfp: + shutil.copyfileobj(rfp, wfp) + + # extract the all-in-one file + tar = tarfile.open(all_file_temp, "r:gz") + tar.extractall(path=extracted_data_dir) + tar.close() + + +for d in dirs_to_extract: + dir_ = os.listdir(os.path.join(coswara_data_dir, d)) + part_files = [os.path.join(dir_, file) for file in dir_ if 'tar.gz' in file] + extract(part_files) + os.remove(os.path.join(extracted_data_dir, "temp.tar.gz")) print("Extraction process complete!") From 8e9dc3593b1d6915186dc663a19b2b1dc977566c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=2E=20Ram=C3=ADrez?= Date: Fri, 23 Sep 2022 03:06:51 +0200 Subject: [PATCH 2/3] Fix a bug --- extract_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extract_data.py b/extract_data.py index b8ec104a..2b4fc6fe 100644 --- a/extract_data.py +++ b/extract_data.py @@ -37,8 +37,8 @@ def extract(infile: str): for d in dirs_to_extract: - dir_ = os.listdir(os.path.join(coswara_data_dir, d)) - part_files = [os.path.join(dir_, file) for file in dir_ if 'tar.gz' in file] + dir_ = os.path.join(coswara_data_dir, d) + part_files = [os.path.join(dir_, file) for file in os.listdir(dir_) if 'tar.gz' in file] extract(part_files) os.remove(os.path.join(extracted_data_dir, "temp.tar.gz")) From 9e3dac0f2314095a79a13707a7256c01a9e515b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=2E=20Ram=C3=ADrez?= <32576858+JMasr@users.noreply.github.com> Date: Wed, 28 Sep 2022 15:03:33 +0200 Subject: [PATCH 3/3] Update extract_data.py Adding a 
'sorting' to concatenate the files in order. Adding a printable notification for each file. Tested on Windows & Linux. --- extract_data.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/extract_data.py b/extract_data.py index 2b4fc6fe..4ec20e92 100644 --- a/extract_data.py +++ b/extract_data.py @@ -8,32 +8,34 @@ ''' coswara_data_dir = os.path.abspath('.') # Local Path of iiscleap/Coswara-Data Repo -extracted_data_dir = os.path.join(coswara_data_dir, 'extracted_data') +extracted_data_dir = os.path.join(coswara_data_dir, 'Extracted_data') if not os.path.exists(coswara_data_dir): - raise ("Check the Coswara dataset directory!") + raise "Check the Coswara dataset directory!" if not os.path.exists(extracted_data_dir): os.makedirs(extracted_data_dir) # Creates the Extracted_data folder if it doesn't exist -dirs_extracted = set(map(os.path.basename, glob.glob('{}/202*'.format(extracted_data_dir)))) -dirs_all = set(map(os.path.basename, glob.glob('{}/202*'.format(coswara_data_dir)))) +dirs_extracted = set(map(os.path.basename, glob.glob(f'{extracted_data_dir}/202*'))) +dirs_all = set(map(os.path.basename, glob.glob(f'{coswara_data_dir}/202*'))) dirs_to_extract = list(set(dirs_all) - dirs_extracted) all_file_temp = os.path.join(extracted_data_dir, "temp.tar.gz") -def extract(infile: str): +def extract(infile: list): + print(f'Extracting {infile[0].split(".a")[0]}') # concatenate all the *tar.gz* files with open(all_file_temp, 'wb') as wfp: + infile.sort() for fn in infile: with open(fn, 'rb') as rfp: shutil.copyfileobj(rfp, wfp) - # extract the all-in-one file - tar = tarfile.open(all_file_temp, "r:gz") - tar.extractall(path=extracted_data_dir) - tar.close() + # extract the all-in-one file + tar = tarfile.open(all_file_temp, "r:gz") + tar.extractall(path=extracted_data_dir) + tar.close() for d in dirs_to_extract: