From 413d9d67017210163ed1c308a5b91439b7526a17 Mon Sep 17 00:00:00 2001 From: Philip Mateescu Date: Fri, 4 Sep 2020 20:44:43 -0500 Subject: [PATCH 1/6] allows specifying dump files individually fixes #112 --- README.md | 20 ++++++++++++++- discogsxml2db/exporter.py | 53 +++++++++++++++++++++++++++------------ run.py | 10 +++++--- 3 files changed, 63 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index e74e501..1457e77 100644 --- a/README.md +++ b/README.md @@ -62,23 +62,41 @@ $ sha256sum -c discogs_*_CHECKSUM.txt Run `run.py` to convert the dump files to csv. +There are two run modes: + +1. You can point it to a directory where the discogs dump files are + and use one or multiple `--export` options to indicate which files to process: + ```sh # ensure the virtual environment is active (.discogsenv) $ python3 run.py \ --bz2 \ # compresses resulting csv files --apicounts \ # provides more accurate progress counts --export artist --export label --export master --export release \ + --output csv-dir # folder where to output the csv files dump-dir \ # folder where the data dumps are - csv-dir # folder where to output the csv files +``` + +2. You can specify the individual files instead: + +```sh +# ensure the virtual environment is active +(.discogsenv) $ python3 run.py \ + --bz2 \ # compresses resulting csv files + --apicounts \ # provides more accurate progress counts + --output csv-dir # folder where to output the csv files + path/to/discogs_20200806_artist.xml.gz path/to/discogs_20200806_labels.xml.gz ``` `run.py` takes the following arguments: - `--export`: the types of dump files to export: "artist", "label", "master", "release. It matches the names of the dump files, e.g. "discogs_20200806_*artist*s.xml.gz" + Not needed if the individual files are specified. - `--bz2`: Compresses output csv files using bz2 compression library. 
- `--limit=`: Limits export to some number of entities - `--apicounts`: Makes progress report more accurate by getting total amounts from Discogs API. +- `--output`: the folder where to store the csv files; defaults to the current directory The exporter provides progress information in real time: diff --git a/discogsxml2db/exporter.py b/discogsxml2db/exporter.py index 9a23c7c..a5dfe67 100644 --- a/discogsxml2db/exporter.py +++ b/discogsxml2db/exporter.py @@ -37,7 +37,7 @@ def _write_rows(writer, entity, name): class EntityCsvExporter(object): """Read a Discogs dump XML file and exports SQL table records as CSV. """ - def __init__(self, entity, in_dir, out_dir, + def __init__(self, entity, in_file_or_dir, out_dir, limit=None, bz2=True, dry_run=False, debug=False, max_hint=None, verbose=False): self.entity = entity @@ -45,8 +45,11 @@ def __init__(self, entity, in_dir, out_dir, self.max_hint = max_hint self.verbose = verbose - lookup = 'discogs_[0-9]*_{}s.xml*'.format(entity) - self.pattern = os.path.join(in_dir, lookup) + if os.path.isfile(in_file_or_dir): + self.pattern = in_file_or_dir + else: + lookup = 'discogs_[0-9]*_{}s.xml*'.format(entity) + self.pattern = os.path.join(in_file_or_dir, lookup) # where and how the exporter will write to self.out_dir = out_dir @@ -287,8 +290,7 @@ def write_track_artists(self, writer, release): def main(arguments): - in_base = arguments['INPUT'] - out_base = arguments['OUTPUT'] or '.' + out_base = arguments['--output'] or '.' 
limit = int(arguments['--limit']) if arguments['--limit'] else None bz2_on = arguments['--bz2'] debug = arguments['--debug'] @@ -312,14 +314,33 @@ def main(arguments): except Exception: pass - for entity in arguments['--export']: - expected_count = rough_counts['{}s'.format(entity)] - exporter = _exporters[entity]( - in_base, - out_base, - limit=limit, - bz2=bz2_on, - debug=debug, - max_hint=min(expected_count, limit or expected_count), - dry_run=dry_run) - exporter.export() + if arguments['INPUT_DIR']: + # use --export to select the entities + in_base = arguments['INPUT_DIR'] + for entity in arguments['--export']: + expected_count = rough_counts['{}s'.format(entity)] + exporter = _exporters[entity]( + in_base, + out_base, + limit=limit, + bz2=bz2_on, + debug=debug, + max_hint=min(expected_count, limit or expected_count), + dry_run=dry_run) + exporter.export() + elif arguments[""]: + for in_file in arguments[""]: + for entity in _exporters: + # discogs files are named discogs_{date}_{entity}s.xml + if f"_{entity}" in in_file: + expected_count = rough_counts['{}s'.format(entity)] + exporter = _exporters[entity]( + in_file, + out_base, + limit=limit, + bz2=bz2_on, + debug=debug, + max_hint=min(expected_count, limit or expected_count), + dry_run=dry_run) + exporter.export() + break diff --git a/run.py b/run.py index 7f88f04..01369a9 100644 --- a/run.py +++ b/run.py @@ -1,15 +1,17 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- """Usage: - run.py [--bz2] [--dry-run] [--limit=] [--debug] [--apicounts] INPUT [OUTPUT] [--export=]... + run.py [--bz2] [--dry-run] [--limit=] [--debug] [--apicounts] [--output=] INPUT_DIR [--export=]... + run.py [--bz2] [--dry-run] [--limit=] [--debug] [--apicounts] [--output=] ... Options: --bz2 Compress output files using bz2 compression library. --limit= Limit export to some number of entities - --export= Limit export to some entities (repeatable) + --export= Limit export to some entities (repeatable). 
+ Entity is one of: artist, label, master, release. --debug Turn on debugging prints --apicounts Check entities counts with Discogs API - --dry-run Do not write + --dry-run Do not write csv files. """ import sys @@ -20,4 +22,6 @@ if __name__ == '__main__': arguments = docopt(__doc__, version='Discogs-to-SQL exporter') + if arguments["--debug"]: + print(arguments) sys.exit(main(arguments)) From e56981a8f2b028ffe8e46a1d15edb0ab91c53c89 Mon Sep 17 00:00:00 2001 From: Philip Mateescu Date: Fri, 4 Sep 2020 20:48:45 -0500 Subject: [PATCH 2/6] updates documentation with --output parameter --- run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run.py b/run.py index 01369a9..4833338 100644 --- a/run.py +++ b/run.py @@ -6,12 +6,13 @@ Options: --bz2 Compress output files using bz2 compression library. - --limit= Limit export to some number of entities + --limit= Limit export to some number of entities (all otherwise) --export= Limit export to some entities (repeatable). Entity is one of: artist, label, master, release. --debug Turn on debugging prints --apicounts Check entities counts with Discogs API --dry-run Do not write csv files. + --output Where to write the csv files. Defaults to current dir. 
""" import sys From 9c7a48b73042f26cd9e20ca6bf6fb06d64c0c6d7 Mon Sep 17 00:00:00 2001 From: Philip Mateescu Date: Fri, 4 Sep 2020 20:49:03 -0500 Subject: [PATCH 3/6] updates test parameters passed to main test method --- tests/test_extract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_extract.py b/tests/test_extract.py index af2532e..285c33f 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -76,8 +76,8 @@ def _check_counts(self, entity, tmp_path): # - export=label arguments = { - "INPUT": self._samples_folder, - "OUTPUT": tmp_path, + "INPUT_DIR": self._samples_folder, + "--output": tmp_path, "--export": [entity], "--limit": None, "--bz2": False, From 74bdb67434207776e4efa9cd45d910a15c0048cd Mon Sep 17 00:00:00 2001 From: Philip Mateescu Date: Fri, 4 Sep 2020 21:00:57 -0500 Subject: [PATCH 4/6] documents output argument --- run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run.py b/run.py index 4833338..76f7277 100644 --- a/run.py +++ b/run.py @@ -1,8 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- """Usage: - run.py [--bz2] [--dry-run] [--limit=] [--debug] [--apicounts] [--output=] INPUT_DIR [--export=]... - run.py [--bz2] [--dry-run] [--limit=] [--debug] [--apicounts] [--output=] ... + run.py [--bz2] [--dry-run] [--limit=] [--debug] [--apicounts] [--output=] ... + run.py [--bz2] [--dry-run] [--limit=] [--debug] [--apicounts] [--output=] INPUT_DIR [--export=]... Options: --bz2 Compress output files using bz2 compression library. @@ -12,7 +12,7 @@ --debug Turn on debugging prints --apicounts Check entities counts with Discogs API --dry-run Do not write csv files. - --output Where to write the csv files. Defaults to current dir. + --output= Where to write the csv files. Defaults to current dir. 
""" import sys From 02b419e81b24085acf886325e6bdfcd51df1ca3c Mon Sep 17 00:00:00 2001 From: Philip Mateescu Date: Fri, 4 Sep 2020 21:01:41 -0500 Subject: [PATCH 5/6] handles only one file specified if only one file is specified it goes in INPUT_DIR --- discogsxml2db/exporter.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/discogsxml2db/exporter.py b/discogsxml2db/exporter.py index a5dfe67..72868cf 100644 --- a/discogsxml2db/exporter.py +++ b/discogsxml2db/exporter.py @@ -314,7 +314,7 @@ def main(arguments): except Exception: pass - if arguments['INPUT_DIR']: + if arguments["INPUT_DIR"] and os.path.isdir(arguments["INPUT_DIR"]): # use --export to select the entities in_base = arguments['INPUT_DIR'] for entity in arguments['--export']: @@ -328,8 +328,13 @@ def main(arguments): max_hint=min(expected_count, limit or expected_count), dry_run=dry_run) exporter.export() - elif arguments[""]: - for in_file in arguments[""]: + elif arguments[""] or os.path.isfile(arguments["INPUT_DIR"]): + files = [] + if arguments[""]: + files = arguments[""] + else: + files = [ arguments["INPUT_DIR"] ] + for in_file in files: for entity in _exporters: # discogs files are named discogs_{date}_{entity}s.xml if f"_{entity}" in in_file: From 654a7de1370d2deee792d973c3cc7d01bc2a158e Mon Sep 17 00:00:00 2001 From: Philip Mateescu Date: Fri, 4 Sep 2020 21:02:59 -0500 Subject: [PATCH 6/6] fixes linting error --- discogsxml2db/exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discogsxml2db/exporter.py b/discogsxml2db/exporter.py index 72868cf..f62f4b4 100644 --- a/discogsxml2db/exporter.py +++ b/discogsxml2db/exporter.py @@ -333,7 +333,7 @@ def main(arguments): if arguments[""]: files = arguments[""] else: - files = [ arguments["INPUT_DIR"] ] + files = [arguments["INPUT_DIR"]] for in_file in files: for entity in _exporters: # discogs files are named discogs_{date}_{entity}s.xml